mirror of https://github.com/huggingface/transformers.git
[Examples] Replicates the new --log_level feature to all trainer-based pytorch (#12359)
* added log_level
* fix comment
* fixed log_level
* Trigger CI
* Unified logging
* simplified args for log_level
This commit is contained in:
commit 539ee456d4 (parent 64e6098094)
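The change replicates to every Trainer-based PyTorch example the logging setup built around `TrainingArguments.log_level`: each script resolves a single numeric level from the training arguments and applies it to its own logger as well as to the `datasets` and `transformers` library loggers. A minimal sketch of the replicated pattern, assuming `training_args` is an already-parsed `TrainingArguments` (the `format` string is the one used by the example scripts; it does not appear in the hunks below):

import logging
import sys

import datasets
import transformers

logger = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Resolve one level for this process: the main process follows --log_level,
# replicas follow --log_level_replica; "passive" keeps the library defaults.
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()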
@@ -28,6 +28,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset

 import transformers
@@ -203,18 +204,19 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -246,15 +248,17 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[:{data_args.validation_split_percentage}%]",
                 cache_dir=model_args.cache_dir,
             )
-            datasets["train"] = load_dataset(
+            raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[{data_args.validation_split_percentage}%:]",
@@ -273,7 +277,7 @@ def main():
         )
         if extension == "txt":
             extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -334,9 +338,9 @@ def main():
     # Preprocessing the datasets.
     # First we tokenize all the texts.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     else:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
@@ -352,7 +356,7 @@ def main():
             )
         return output

-    tokenized_datasets = datasets.map(
+    tokenized_datasets = raw_datasets.map(
         tokenize_function,
         batched=True,
         num_proc=data_args.preprocessing_num_workers,
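The validation fallback in the hunks above relies on the `datasets` split-slicing syntax to carve a held-out split out of `train` when a dataset ships without one; a standalone sketch (dataset name and percentage illustrative):

from datasets import load_dataset

# train[:5%] and train[5%:] are complementary slices of the same split.
validation_split = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")
train_split = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[5%:]")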
@@ -28,6 +28,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset

 import transformers
@@ -212,7 +213,13 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
@@ -220,10 +227,6 @@ def main():
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -255,15 +258,17 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[:{data_args.validation_split_percentage}%]",
                 cache_dir=model_args.cache_dir,
             )
-            datasets["train"] = load_dataset(
+            raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[{data_args.validation_split_percentage}%:]",
@@ -278,7 +283,7 @@ def main():
         extension = data_args.train_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -337,9 +342,9 @@ def main():
     # Preprocessing the datasets.
     # First we tokenize all the texts.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     else:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

     if data_args.max_seq_length is None:
@@ -377,7 +382,7 @@ def main():
                 return_special_tokens_mask=True,
             )

-        tokenized_datasets = datasets.map(
+        tokenized_datasets = raw_datasets.map(
             tokenize_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
@@ -392,7 +397,7 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

-        tokenized_datasets = datasets.map(
+        tokenized_datasets = raw_datasets.map(
             tokenize_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
@@ -25,6 +25,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset

 import transformers
@@ -209,18 +210,19 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -252,15 +254,17 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[:{data_args.validation_split_percentage}%]",
                 cache_dir=model_args.cache_dir,
             )
-            datasets["train"] = load_dataset(
+            raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[{data_args.validation_split_percentage}%:]",
@@ -275,7 +279,7 @@ def main():
         extension = data_args.train_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -334,9 +338,9 @@ def main():
     # Preprocessing the datasets.
     # First we tokenize all the texts.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     else:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

     if data_args.max_seq_length > tokenizer.model_max_length:
@@ -355,7 +359,7 @@ def main():
             examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
             return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)

-        tokenized_datasets = datasets.map(
+        tokenized_datasets = raw_datasets.map(
             tokenize_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
@@ -368,7 +372,7 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name])

-        tokenized_datasets = datasets.map(
+        tokenized_datasets = raw_datasets.map(
             tokenize_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional, Union

+import datasets
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -220,18 +221,18 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -268,10 +269,10 @@ def main():
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
         extension = data_args.train_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     else:
         # Downloading and loading the swag dataset from the hub.
-        datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -347,9 +348,9 @@ def main():
         return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}

     if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         train_dataset = train_dataset.map(
@@ -360,9 +361,9 @@ def main():
         )

     if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
+        eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset, load_metric

 import transformers
@@ -216,18 +217,19 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -259,7 +261,9 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
     else:
         data_files = {}
         if data_args.train_file is not None:
@@ -272,7 +276,7 @@ def main():
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
             extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -314,11 +318,11 @@ def main():
     # Preprocessing the datasets.
     # Preprocessing is slighlty different for training and evaluation.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     else:
-        column_names = datasets["test"].column_names
+        column_names = raw_datasets["test"].column_names
     question_column_name = "question" if "question" in column_names else column_names[0]
     context_column_name = "context" if "context" in column_names else column_names[1]
     answer_column_name = "answers" if "answers" in column_names else column_names[2]
@@ -407,9 +411,9 @@ def main():
         return tokenized_examples

     if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if agument is specified
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
@@ -469,9 +473,9 @@ def main():
         return tokenized_examples

     if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
-        eval_examples = datasets["validation"]
+        eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
             eval_examples = eval_examples.select(range(data_args.max_eval_samples))
@@ -489,9 +493,9 @@ def main():
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

     if training_args.do_predict:
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
-        predict_examples = datasets["test"]
+        predict_examples = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             # We will select sample from whole data
             predict_examples = predict_examples.select(range(data_args.max_predict_samples))
@@ -529,7 +533,7 @@ def main():
             max_answer_length=data_args.max_answer_length,
             null_score_diff_threshold=data_args.null_score_diff_threshold,
             output_dir=training_args.output_dir,
-            is_world_process_zero=trainer.is_world_process_zero(),
+            log_level=log_level,
             prefix=stage,
         )
         # Format the result to the format the metric expects.
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset, load_metric

 import transformers
@@ -215,18 +216,18 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -258,7 +259,9 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
     else:
         data_files = {}
         if data_args.train_file is not None:
@@ -270,7 +273,7 @@ def main():
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
             extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -303,11 +306,11 @@ def main():
     # Preprocessing the datasets.
     # Preprocessing is slighlty different for training and evaluation.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     else:
-        column_names = datasets["test"].column_names
+        column_names = raw_datasets["test"].column_names
     question_column_name = "question" if "question" in column_names else column_names[0]
     context_column_name = "context" if "context" in column_names else column_names[1]
     answer_column_name = "answers" if "answers" in column_names else column_names[2]
@@ -419,9 +422,9 @@ def main():
         return tokenized_examples

     if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # Select samples from Dataset, This will help to decrease processing time
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
@@ -505,9 +508,9 @@ def main():
         return tokenized_examples

     if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
-        eval_examples = datasets["validation"]
+        eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # Selecting Eval Samples from Dataset
             eval_examples = eval_examples.select(range(data_args.max_eval_samples))
@@ -525,9 +528,9 @@ def main():
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

     if training_args.do_predict:
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
-        predict_examples = datasets["test"]
+        predict_examples = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             # We will select sample from whole data
             predict_examples = predict_examples.select(range(data_args.max_predict_samples))
@@ -566,7 +569,7 @@ def main():
             start_n_top=model.config.start_n_top,
             end_n_top=model.config.end_n_top,
             output_dir=training_args.output_dir,
-            is_world_process_zero=trainer.is_world_process_zero(),
+            log_level=log_level,
             prefix=stage,
         )
         # Format the result to the format the metric expects.
@@ -38,7 +38,7 @@ def postprocess_qa_predictions(
     null_score_diff_threshold: float = 0.0,
     output_dir: Optional[str] = None,
     prefix: Optional[str] = None,
-    is_world_process_zero: bool = True,
+    log_level: Optional[int] = logging.WARNING,
 ):
     """
     Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
@@ -70,8 +70,8 @@ def postprocess_qa_predictions(
             answers, are saved in `output_dir`.
         prefix (:obj:`str`, `optional`):
             If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
-        is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this process is the main process or not (used to determine if logging/saves should be done).
+        log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+            ``logging`` log level (e.g., ``logging.WARNING``)
     """
     assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
     all_start_logits, all_end_logits = predictions
@@ -91,7 +91,7 @@ def postprocess_qa_predictions(
     scores_diff_json = collections.OrderedDict()

     # Logging.
-    logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
+    logger.setLevel(log_level)
     logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

     # Let's loop over all the examples!
@@ -250,7 +250,7 @@ def postprocess_qa_predictions_with_beam_search(
     end_n_top: int = 5,
     output_dir: Optional[str] = None,
     prefix: Optional[str] = None,
-    is_world_process_zero: bool = True,
+    log_level: Optional[int] = logging.WARNING,
 ):
     """
     Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
@@ -280,8 +280,8 @@ def postprocess_qa_predictions_with_beam_search(
             answers, are saved in `output_dir`.
         prefix (:obj:`str`, `optional`):
             If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
-        is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this process is the main process or not (used to determine if logging/saves should be done).
+        log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+            ``logging`` log level (e.g., ``logging.WARNING``)
     """
     assert len(predictions) == 5, "`predictions` should be a tuple with five elements."
     start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
@@ -302,7 +302,7 @@ def postprocess_qa_predictions_with_beam_search(
     scores_diff_json = collections.OrderedDict() if version_2_with_negative else None

     # Logging.
-    logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
+    logger.setLevel(log_level)
     logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

     # Let's loop over all the examples!
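With the signature change above, the QA scripts hand the numeric level to post-processing instead of a world-process flag; a sketch of the assumed call site (only `output_dir`, `log_level`, and `prefix` appear in the hunks, the remaining argument names are illustrative):

predictions = postprocess_qa_predictions(
    examples=examples,            # raw (untokenized) examples
    features=features,            # tokenized features mapped back to the examples
    predictions=raw_predictions,  # (start_logits, end_logits) from the model
    output_dir=training_args.output_dir,
    log_level=log_level,          # value from training_args.get_process_log_level()
    prefix=stage,
)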
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 import nltk  # Here to have a nice missing dependency error message early on
 import numpy as np
 from datasets import load_dataset, load_metric
@@ -260,16 +261,18 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
     logger.info(f"Training/evaluation parameters {training_args}")

     if data_args.source_prefix is None and model_args.model_name_or_path in [
@@ -313,7 +316,9 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
     else:
         data_files = {}
         if data_args.train_file is not None:
@@ -325,7 +330,7 @@ def main():
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
             extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -366,11 +371,11 @@ def main():
     # Preprocessing the datasets.
     # We need to tokenize inputs and targets.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     elif training_args.do_predict:
-        column_names = datasets["test"].column_names
+        column_names = raw_datasets["test"].column_names
     else:
         logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
         return
@@ -425,9 +430,9 @@ def main():
         return model_inputs

     if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         train_dataset = train_dataset.map(
@@ -441,9 +446,9 @@ def main():

     if training_args.do_eval:
         max_target_length = data_args.val_max_target_length
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
+        eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
@@ -457,9 +462,9 @@ def main():

     if training_args.do_predict:
         max_target_length = data_args.val_max_target_length
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
-        predict_dataset = datasets["test"]
+        predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
         predict_dataset = predict_dataset.map(
@@ -23,6 +23,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 import numpy as np
 from datasets import load_dataset, load_metric

@@ -204,18 +205,19 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -250,10 +252,12 @@ def main():
     # download the dataset.
     if data_args.task_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir)
     elif data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
     else:
         # Loading a dataset from your local files.
         # CSV/JSON training and evaluation files are needed.
@@ -277,10 +281,10 @@ def main():

         if data_args.train_file.endswith(".csv"):
             # Loading a dataset from local csv files
-            datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir)
+            raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir)
         else:
             # Loading a dataset from local json files
-            datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
+            raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -288,19 +292,19 @@ def main():
     if data_args.task_name is not None:
         is_regression = data_args.task_name == "stsb"
         if not is_regression:
-            label_list = datasets["train"].features["label"].names
+            label_list = raw_datasets["train"].features["label"].names
             num_labels = len(label_list)
         else:
             num_labels = 1
     else:
         # Trying to have good defaults here, don't hesitate to tweak to your needs.
-        is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"]
+        is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
         if is_regression:
             num_labels = 1
         else:
             # A useful fast method:
             # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
-            label_list = datasets["train"].unique("label")
+            label_list = raw_datasets["train"].unique("label")
             label_list.sort()  # Let's sort it for determinism
             num_labels = len(label_list)

@@ -332,12 +336,12 @@ def main():
         use_auth_token=True if model_args.use_auth_token else None,
     )

-    # Preprocessing the datasets
+    # Preprocessing the raw_datasets
     if data_args.task_name is not None:
         sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
     else:
         # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
-        non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]
+        non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
         if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
             sentence1_key, sentence2_key = "sentence1", "sentence2"
         else:
@@ -396,30 +400,30 @@ def main():
             result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
         return result

-    datasets = datasets.map(
+    raw_datasets = raw_datasets.map(
         preprocess_function,
         batched=True,
         load_from_cache_file=not data_args.overwrite_cache,
         desc="Running tokenizer on dataset",
     )
     if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))

     if training_args.do_eval:
-        if "validation" not in datasets and "validation_matched" not in datasets:
+        if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
+        eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

     if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
-        if "test" not in datasets and "test_matched" not in datasets:
+        if "test" not in raw_datasets and "test_matched" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
-        predict_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"]
+        predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))

@@ -497,7 +501,7 @@ def main():
         eval_datasets = [eval_dataset]
         if data_args.task_name == "mnli":
             tasks.append("mnli-mm")
-            eval_datasets.append(datasets["validation_mismatched"])
+            eval_datasets.append(raw_datasets["validation_mismatched"])

         for eval_dataset, task in zip(eval_datasets, tasks):
             metrics = trainer.evaluate(eval_dataset=eval_dataset)
@@ -518,7 +522,7 @@ def main():
         predict_datasets = [predict_dataset]
         if data_args.task_name == "mnli":
             tasks.append("mnli-mm")
-            predict_datasets.append(datasets["test_mismatched"])
+            predict_datasets.append(raw_datasets["test_mismatched"])

         for predict_dataset, task in zip(predict_datasets, tasks):
             # Removing the `label` columns because it contains -1 and Trainer won't like that.
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 import numpy as np
 from datasets import load_dataset, load_metric

@@ -174,19 +175,19 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -25,6 +25,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 import numpy as np
 from datasets import ClassLabel, load_dataset, load_metric

@@ -195,18 +196,19 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -238,7 +240,9 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
     else:
         data_files = {}
         if data_args.train_file is not None:
@@ -248,16 +252,16 @@ def main():
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
         extension = data_args.train_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

     if training_args.do_train:
-        column_names = datasets["train"].column_names
-        features = datasets["train"].features
+        column_names = raw_datasets["train"].column_names
+        features = raw_datasets["train"].features
     else:
-        column_names = datasets["validation"].column_names
-        features = datasets["validation"].features
+        column_names = raw_datasets["validation"].column_names
+        features = raw_datasets["validation"].features

     if data_args.text_column_name is not None:
         text_column_name = data_args.text_column_name
@@ -288,7 +292,7 @@ def main():
         # No need to convert the labels since they are already ints.
         label_to_id = {i: i for i in range(len(label_list))}
     else:
-        label_list = get_label_list(datasets["train"][label_column_name])
+        label_list = get_label_list(raw_datasets["train"][label_column_name])
         label_to_id = {l: i for i, l in enumerate(label_list)}
     num_labels = len(label_list)

@@ -381,9 +385,9 @@ def main():
         return tokenized_inputs

     if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         train_dataset = train_dataset.map(
@@ -395,9 +399,9 @@ def main():
         )

     if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
+        eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
@@ -409,9 +413,9 @@ def main():
         )

     if training_args.do_predict:
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
-        predict_dataset = datasets["test"]
+        predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
         predict_dataset = predict_dataset.map(
@@ -344,7 +344,7 @@ def main():

     model.resize_token_embeddings(len(tokenizer))

-    # Preprocessing the raw_datasets.
+    # Preprocessing the datasets.
     # First we tokenize all the texts.
     padding = "max_length" if args.pad_to_max_length else False

@@ -250,6 +250,8 @@ def main():
     logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
     transformers.utils.logging.enable_default_handler()
     transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
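Once replicated, each of these scripts accepts the flag directly on the command line, for example (script path and other arguments illustrative):

python examples/pytorch/text-classification/run_glue.py \
    --model_name_or_path bert-base-cased \
    --task_name mrpc \
    --do_train \
    --output_dir /tmp/mrpc \
    --log_level warning

Accepted values mirror the Python logging level names (debug, info, warning, error, critical) plus passive, which leaves the level at the Trainer's default behavior.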