Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-04 13:20:12 +06:00)
fix bug in distributed loss test (#38166)
* fix bug in distributed loss test and change some config to pass at both 2 & 8 GPUs
* fix doc
This commit is contained in:
parent a4389494c7
commit ea29f61ed9
@@ -3784,7 +3784,7 @@ class Trainer:
             with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                 scaled_loss.backward()
         else:
-            # Finally we need to normalize the loss for reporting
+            # Finally we need to normalize the loss for reporting if GA loss bug is not fixed during compute loss
             if not self.model_accepts_loss_kwargs and self.compute_loss_func is None:
                 loss = loss / self.args.gradient_accumulation_steps

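For context, the reworded comment refers to the gradient accumulation (GA) loss normalization in Trainer.training_step: when the model does not accept loss kwargs and no custom compute_loss_func is supplied, each micro-batch loss is already a mean, so it has to be divided by gradient_accumulation_steps before backward or the accumulated gradient ends up too large. A minimal, hedged sketch of that control flow, assuming a standard PyTorch model whose forward returns an averaged loss (the function and argument names below are illustrative, not Trainer's API):

# Sketch only; mirrors the normalization the comment above describes.
def accumulate_step(model, micro_batches, gradient_accumulation_steps,
                    model_accepts_loss_kwargs=False, compute_loss_func=None):
    total = 0.0
    for inputs in micro_batches:
        loss = model(**inputs).loss  # mean loss over this micro-batch
        if not model_accepts_loss_kwargs and compute_loss_func is None:
            # Without kwargs-aware loss scaling, normalize here so that summing
            # gradients over gradient_accumulation_steps micro-batches matches
            # one large-batch update.
            loss = loss / gradient_accumulation_steps
        loss.backward()
        total += loss.detach()
    return total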
@@ -26,7 +26,7 @@ class TestTrainerDistributedLoss(TestCasePlus):
     @require_torch_multi_accelerator
     def test_trainer(self):
         device_count = backend_device_count(torch_device)
-        min_bs = 1
+        min_bs = 2
         output_dir = self.get_auto_remove_tmp_dir()
         for gpu_num, enable, bs, name in (
             (1, True, min_bs * device_count, "base"),
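The "base" entry runs on a single process with batch size min_bs * device_count; the other configurations (not shown in this hunk) presumably use min_bs per device, so every run sees the same effective batch. Bumping min_bs from 1 to 2 keeps that equivalence while giving each GPU more than one sample per step, which, per the commit message, is what lets the test pass on both 2 and 8 GPUs. A small, hedged sketch of that equivalence (only the "base" row is taken from the diff; the other rows are assumptions, not the test's exact parametrization):

# Illustrative (gpu_num, enable_fix, per_process_bs, name) rows.
min_bs = 2
for device_count in (2, 8):
    configs = [
        (1, True, min_bs * device_count, "base"),
        (device_count, True, min_bs, "fixed"),    # assumed distributed config
        (device_count, False, min_bs, "broken"),  # assumed distributed config
    ]
    effective = {name: gpu_num * bs for gpu_num, _, bs, name in configs}
    # Every run processes the same number of samples per optimizer step.
    assert len(set(effective.values())) == 1, effective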
@@ -50,9 +50,10 @@ class TestTrainerDistributedLoss(TestCasePlus):
         broken_diff = [abs(base_loss[i] - broken_loss[i]) for i in range(len(base_loss))]
         fixed_diff = [abs(base_loss[i] - fixed_loss[i]) for i in range(len(base_loss))]
         sum_base = sum(base_loss)
-        sum_broken = sum(broken_diff)
+        sum_broken = sum(broken_loss)
         relative_broken = abs(sum_base - sum_broken) / max(sum_base, sum_broken)

+        # the gap may be smaller for other models, but it still ok.
         self.assertGreater(max(broken_diff), 0.5)
         self.assertLess(max(fixed_diff), 0.005)
         self.assertLess(relative_broken, 0.1)
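The actual bug fixed in this hunk is in the aggregate check: relative_broken is meant to compare the total base loss with the total loss of the broken run, so it must sum broken_loss rather than the per-step differences in broken_diff. A hedged, self-contained sketch of the corrected metric with made-up loss values (the numbers are placeholders, not the test's real logs):

# Hypothetical per-step losses; the real test collects them from trainer logs.
base_loss = [10.0, 9.5, 9.0, 8.5]
broken_loss = [10.6, 8.9, 9.6, 8.0]  # drifts per step when the GA fix is disabled

broken_diff = [abs(b - k) for b, k in zip(base_loss, broken_loss)]

sum_base = sum(base_loss)
sum_broken = sum(broken_loss)  # fixed: sum the losses, not the diffs
relative_broken = abs(sum_base - sum_broken) / max(sum_base, sum_broken)

print(max(broken_diff))   # individual steps diverge noticeably (> 0.5 here)
print(relative_broken)    # but the totals stay within a few percent (< 0.1 here)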
@@ -63,7 +64,7 @@ def run_distributed_training(training_args):
     model_name = "nickypro/tinyllama-15M"
     dataset_name = "wikitext"
     dataset_config = "wikitext-2-raw-v1"
-    dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:17]")
+    dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:100]")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     tokenizer.pad_token = tokenizer.eos_token

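The last hunk simply enlarges the wikitext slice so the helper still has enough rows when the batch size and device count go up. A standalone, hedged sketch of the same data preparation (the tokenization step at the end, including max_length, is an illustrative assumption rather than the helper's exact code):

import datasets
from transformers import AutoTokenizer

model_name = "nickypro/tinyllama-15M"
# train[:100] keeps the run quick while leaving enough samples for 2 or 8 GPUs.
dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:100]")

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # tinyllama ships without a pad token

def tokenize(batch):
    # Assumed tokenization; the real helper may configure this differently.
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])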