[Speech Recognition] - Distributed training: Make sure vocab file removal and creation don't interfere (#14161)

* up * better
2025-08-01 18:51:14 +06:00 · 2021-10-26 15:59:33 +02:00 · 2021-10-26 15:59:33 +02:00 · f5ed19f57d
commit f5ed19f57d
parent 840fc8dbca
1 changed file with 9 additions and 9 deletions
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -395,20 +395,20 @@ def main():
    # the training and evaluation datasets
    # We need to make sure that only first rank saves vocabulary
    # make sure all processes wait until vocab is created
    vocab_file = os.path.join(training_args.output_dir, "vocab.json")
-    with training_args.main_process_first(desc="dataset map vocabulary creation"):
+    with training_args.main_process_first():
        vocab_dict = create_vocabulary_from_data(raw_datasets)
        vocab_file = os.path.join(training_args.output_dir, "vocab.json")
        # save vocab dict to be loaded into tokenizer
        os.makedirs(training_args.output_dir, exist_ok=True)
        if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
            os.remove(vocab_file)
    with training_args.main_process_first(desc="dataset map vocabulary creation"):
        if not os.path.isfile(vocab_file):
-            with open(vocab_file, "w") as vocab_file:
+            os.makedirs(training_args.output_dir, exist_ok=True)
-                json.dump(vocab_dict, vocab_file)
+            vocab_dict = create_vocabulary_from_data(raw_datasets)
            # save vocab dict to be loaded into tokenizer
            with open(vocab_file, "w") as file:
                json.dump(vocab_dict, file)
    # 4. Now we can instantiate the configuration, feature extractor, tokenizer and model
    # Note for distributed training, the .from_pretrained methods guarantee that only