mirror of
https://github.com/huggingface/transformers.git
synced 2025-08-01 18:51:14 +06:00
[Speech Recognition] - Distributed training: Make sure vocab file removal and creation don't interfere (#14161)
* up * better
This commit is contained in:
parent
840fc8dbca
commit
f5ed19f57d
@ -395,20 +395,20 @@ def main():
|
|||||||
# the training and evaluation datasets
|
# the training and evaluation datasets
|
||||||
# We need to make sure that only first rank saves vocabulary
|
# We need to make sure that only first rank saves vocabulary
|
||||||
# make sure all processes wait until vocab is created
|
# make sure all processes wait until vocab is created
|
||||||
|
vocab_file = os.path.join(training_args.output_dir, "vocab.json")
|
||||||
|
|
||||||
with training_args.main_process_first(desc="dataset map vocabulary creation"):
|
with training_args.main_process_first():
|
||||||
vocab_dict = create_vocabulary_from_data(raw_datasets)
|
|
||||||
|
|
||||||
vocab_file = os.path.join(training_args.output_dir, "vocab.json")
|
|
||||||
|
|
||||||
# save vocab dict to be loaded into tokenizer
|
|
||||||
os.makedirs(training_args.output_dir, exist_ok=True)
|
|
||||||
if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
|
if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
|
||||||
os.remove(vocab_file)
|
os.remove(vocab_file)
|
||||||
|
|
||||||
|
with training_args.main_process_first(desc="dataset map vocabulary creation"):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
with open(vocab_file, "w") as vocab_file:
|
os.makedirs(training_args.output_dir, exist_ok=True)
|
||||||
json.dump(vocab_dict, vocab_file)
|
vocab_dict = create_vocabulary_from_data(raw_datasets)
|
||||||
|
|
||||||
|
# save vocab dict to be loaded into tokenizer
|
||||||
|
with open(vocab_file, "w") as file:
|
||||||
|
json.dump(vocab_dict, file)
|
||||||
|
|
||||||
# 4. Now we can instantiate the configuration, feature extractor, tokenizer and model
|
# 4. Now we can instantiate the configuration, feature extractor, tokenizer and model
|
||||||
# Note for distributed training, the .from_pretrained methods guarantee that only
|
# Note for distributed training, the .from_pretrained methods guarantee that only
|
||||||
|
Loading…
Reference in New Issue
Block a user