diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 7df1fa6a930..0ca6f6af6e5 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -3784,7 +3784,7 @@ class Trainer:
             with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                 scaled_loss.backward()
         else:
-            # Finally we need to normalize the loss for reporting
+            # Finally we need to normalize the loss for reporting if the GA loss bug is not fixed during compute loss
             if not self.model_accepts_loss_kwargs and self.compute_loss_func is None:
                 loss = loss / self.args.gradient_accumulation_steps
 
diff --git a/tests/trainer/test_trainer_distributed_loss.py b/tests/trainer/test_trainer_distributed_loss.py
index 9bae7c92657..405763125ec 100644
--- a/tests/trainer/test_trainer_distributed_loss.py
+++ b/tests/trainer/test_trainer_distributed_loss.py
@@ -26,7 +26,7 @@ class TestTrainerDistributedLoss(TestCasePlus):
     @require_torch_multi_accelerator
     def test_trainer(self):
         device_count = backend_device_count(torch_device)
-        min_bs = 1
+        min_bs = 2
         output_dir = self.get_auto_remove_tmp_dir()
         for gpu_num, enable, bs, name in (
             (1, True, min_bs * device_count, "base"),
@@ -50,9 +50,10 @@ class TestTrainerDistributedLoss(TestCasePlus):
         broken_diff = [abs(base_loss[i] - broken_loss[i]) for i in range(len(base_loss))]
         fixed_diff = [abs(base_loss[i] - fixed_loss[i]) for i in range(len(base_loss))]
         sum_base = sum(base_loss)
-        sum_broken = sum(broken_diff)
+        sum_broken = sum(broken_loss)
         relative_broken = abs(sum_base - sum_broken) / max(sum_base, sum_broken)
 
+        # the gap may be smaller for other models, but it is still ok
         self.assertGreater(max(broken_diff), 0.5)
         self.assertLess(max(fixed_diff), 0.005)
         self.assertLess(relative_broken, 0.1)
@@ -63,7 +64,7 @@ def run_distributed_training(training_args):
     model_name = "nickypro/tinyllama-15M"
     dataset_name = "wikitext"
     dataset_config = "wikitext-2-raw-v1"
-    dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:17]")
+    dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:100]")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     tokenizer.pad_token = tokenizer.eos_token
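
Below is a minimal sketch (not the Hugging Face Trainer implementation; all names and values are illustrative) of the normalization the first hunk's comment refers to: under gradient accumulation, gradients from several micro-batches are summed before optimizer.step(), so each micro-batch loss is divided by gradient_accumulation_steps to keep the accumulated gradient equivalent to one large-batch average. When the loss is already averaged correctly over all accumulated tokens (the "GA loss bug is fixed" case), that division must be skipped, which is what the guarded branch in trainer.py does.

    # Illustrative sketch only -- not transformers.Trainer code.
    import torch

    model = torch.nn.Linear(4, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    gradient_accumulation_steps = 4  # hypothetical value

    optimizer.zero_grad()
    for _ in range(gradient_accumulation_steps):
        x = torch.randn(8, 4)
        y = torch.randn(8, 1)
        loss = torch.nn.functional.mse_loss(model(x), y)
        # Normalize so the summed gradients approximate a single large-batch
        # update; skip this division if the loss function already averages
        # over all accumulated micro-batches.
        (loss / gradient_accumulation_steps).backward()
    optimizer.step()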