From 1b1867d86b9203ee8b9d41bed5d6224c7f407a18 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Tue, 11 Apr 2023 09:32:20 -0400
Subject: [PATCH] Replace -100s in predictions by the pad token (#22693)

* Replace -100s in predictions by the pad token

* Style

* Try to catch them all
---
 examples/pytorch/question-answering/run_seq2seq_qa.py |  3 +++
 examples/pytorch/summarization/run_summarization.py   | 10 ++++++----
 examples/pytorch/translation/run_translation.py       | 10 ++++++----
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py
index 00a30685e53..da56580472f 100644
--- a/examples/pytorch/question-answering/run_seq2seq_qa.py
+++ b/examples/pytorch/question-answering/run_seq2seq_qa.py
@@ -26,6 +26,7 @@ from typing import List, Optional, Tuple
 
 import datasets
 import evaluate
+import numpy as np
 from datasets import load_dataset
 from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer
 
@@ -614,6 +615,8 @@ def main():
         preds = outputs.predictions
         if isinstance(preds, tuple):
             preds = preds[0]
+        # Replace -100s used for padding as we can't decode them
+        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
         decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
 
         # Build a map example to its corresponding features.
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index 587ff5b770e..c2e0a6828c8 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -632,10 +632,10 @@ def main():
         preds, labels = eval_preds
         if isinstance(preds, tuple):
             preds = preds[0]
+        # Replace -100s used for padding as we can't decode them
+        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
         decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
-        if data_args.ignore_pad_token_for_loss:
-            # Replace -100 in the labels as we can't decode them.
-            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 
         # Some simple post-processing
@@ -714,8 +714,10 @@ def main():
 
     if trainer.is_world_process_zero():
         if training_args.predict_with_generate:
+            predictions = predict_results.predictions
+            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
             predictions = tokenizer.batch_decode(
-                predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
+                predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
             )
             predictions = [pred.strip() for pred in predictions]
             output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index 19d7c587bfd..9cac8736175 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -543,10 +543,10 @@ def main():
         preds, labels = eval_preds
         if isinstance(preds, tuple):
             preds = preds[0]
+        # Replace -100s used for padding as we can't decode them
+        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
         decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
-        if data_args.ignore_pad_token_for_loss:
-            # Replace -100 in the labels as we can't decode them.
-            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 
         # Some simple post-processing
@@ -626,8 +626,10 @@ def main():
 
     if trainer.is_world_process_zero():
         if training_args.predict_with_generate:
+            predictions = predict_results.predictions
+            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
             predictions = tokenizer.batch_decode(
-                predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
+                predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
             )
             predictions = [pred.strip() for pred in predictions]
             output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
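The pattern is identical in all three scripts: when generated sequences are gathered, shorter ones are padded with -100 (the loss-masking sentinel, not a valid token id), so every -100 must be mapped back to the tokenizer's pad token before batch_decode. A minimal, self-contained sketch of the np.where step, assuming toy token ids and pad_token_id = 0 (the real scripts use tokenizer.pad_token_id from the loaded tokenizer):

    # Sketch of the fix applied in the patch above; the token ids are made up.
    import numpy as np

    pad_token_id = 0  # assumption for illustration; T5-style tokenizers use 0

    # Two generated sequences padded to a common length with the -100 sentinel.
    preds = np.array(
        [
            [7142, 291, 55, 1, -100, -100],  # shorter sequence, -100 padding
            [2018, 31, 7, 3, 9, 1],          # full-length sequence
        ]
    )

    # The fix: map every -100 back to a real pad token before decoding.
    preds = np.where(preds != -100, preds, pad_token_id)
    assert (preds >= 0).all()  # all ids are now valid vocabulary indices

After this step, tokenizer.batch_decode(preds, skip_special_tokens=True) drops the pad tokens along with the other special tokens, which is why the replacement value can simply be pad_token_id.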