Mirror of https://github.com/huggingface/transformers.git, synced 2025-08-01 02:31:11 +06:00
Fix input data file extension in examples (#28741)
parent 5649c0cbb8
commit 39fa400969
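The bug: when building `data_files`, the example scripts derived the loader extension for the validation and test splits from the *train* file (`data_args.train_file` / `args.train_file`). If the splits used different file formats, `load_dataset` received the wrong builder name; if only `--validation_file` was passed, the scripts crashed outright because `train_file` is `None`. The hunks below derive the extension from the file actually being registered in each branch. A minimal sketch of the failure mode (variable names mirror the examples, values are illustrative):

    train_file = None                 # user passed only --validation_file
    validation_file = "dev.json"      # illustrative value

    data_files = {}
    if train_file is not None:
        data_files["train"] = train_file
    if validation_file is not None:
        data_files["validation"] = validation_file
        # before the fix: extension = train_file.split(".")[-1]
        # -> AttributeError: 'NoneType' object has no attribute 'split'
        extension = validation_file.split(".")[-1]   # after the fix: "json"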
@@ -558,9 +558,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(
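For context on the `txt` remapping that many of these hunks share: the computed extension doubles as the `datasets` builder name passed to `load_dataset` ("csv", "json", ...), and plain-text files use the builder named "text", not "txt". A hedged sketch of the shared pattern (the path is hypothetical):

    from datasets import load_dataset

    validation_file = "dev.txt"                    # hypothetical path
    extension = validation_file.split(".")[-1]     # "txt"
    if extension == "txt":
        extension = "text"                         # "text" is the plain-text builder name
    datasets = load_dataset(extension, data_files={"validation": validation_file})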
@@ -449,9 +449,10 @@ def main():
         dataset_args = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
             dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
@@ -485,9 +485,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(
@@ -599,9 +599,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(
@@ -345,9 +345,10 @@ def main():
         dataset_args = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
             dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
@@ -351,9 +351,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         raw_datasets = load_dataset(extension, data_files=data_files)
@@ -328,9 +328,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
@@ -311,9 +311,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(
             extension,
             data_files=data_files,
@@ -357,9 +357,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # Trim a number of training examples
         if args.debug:
@@ -362,11 +362,13 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
+            extension = args.validation_file.split(".")[-1]
         if args.test_file is not None:
             data_files["test"] = args.test_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.test_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, field="data")
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.
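One caveat these three-split hunks leave in place: `extension` is a single variable overwritten by whichever branch runs last, so all splits are still loaded with one builder, and mixing formats across splits would still hand one file to the wrong builder. (The `field="data"` argument belongs to the json builder and selects the record list inside SQuAD-style files.) Illustrative sketch, not from the patch:

    data_files = {"train": "train.csv", "validation": "dev.json"}  # hypothetical mixed formats
    extension = "json"   # whichever branch ran last wins; the CSV file would then be
                         # handed to the json builder and loading would likely fail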
@@ -410,11 +410,13 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
+            extension = args.validation_file.split(".")[-1]
         if args.test_file is not None:
             data_files["test"] = args.test_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.test_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, field="data")
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.
@@ -404,9 +404,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.
@@ -311,11 +311,13 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
+            extension = data_args.validation_file.split(".")[-1]
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.test_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.
@@ -339,9 +339,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # Trim a number of training examples
         if args.debug:
@@ -384,9 +384,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.
@@ -297,9 +297,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
@@ -285,9 +285,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # Trim a number of training examples
         if args.debug:
@@ -271,9 +271,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(extension, data_files=data_files)
@@ -517,9 +517,10 @@ if __name__ == "__main__":
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(extension, data_files=data_files)
@@ -341,9 +341,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         raw_datasets = load_dataset(
@@ -320,9 +320,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(
             extension,
             data_files=data_files,
@@ -260,9 +260,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-            extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(
             extension,
             data_files=data_files,
@@ -730,9 +730,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-            extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.