Fix input data file extension in examples (#28741)

Klaus Hipp 2024-01-29 11:06:31 +01:00 committed by GitHub
parent 5649c0cbb8
commit 39fa400969
23 changed files with 49 additions and 23 deletions
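
Every hunk below applies the same fix. Previously, `extension` was derived unconditionally from the train file, so passing only a validation (or test) file crashed with an AttributeError on None, and mixed formats across splits selected the wrong `datasets` loader. Now each `if` branch derives `extension` from its own file. A minimal standalone sketch of the before/after behaviour (both helper functions are illustrative, not repository code):

# Minimal sketch, assuming the argument names used by the example scripts;
# neither function exists in the repository.

def infer_extension_buggy(train_file=None, validation_file=None):
    data_files = {}
    if train_file is not None:
        data_files["train"] = train_file
    if validation_file is not None:
        data_files["validation"] = validation_file
    # Bug: always reads train_file, so this raises AttributeError when only
    # validation_file is given, and picks the wrong loader when the splits
    # use different formats.
    extension = train_file.split(".")[-1]
    return "text" if extension == "txt" else extension


def infer_extension_fixed(train_file=None, validation_file=None):
    # The real scripts require at least one data file, so `extension` is
    # always bound before it is read.
    data_files = {}
    if train_file is not None:
        data_files["train"] = train_file
        extension = train_file.split(".")[-1]
    if validation_file is not None:
        data_files["validation"] = validation_file
        extension = validation_file.split(".")[-1]
    return "text" if extension == "txt" else extension


print(infer_extension_fixed(validation_file="dev.json"))  # -> "json"
# infer_extension_buggy(validation_file="dev.json")       # AttributeError

The "txt" to "text" mapping exists because the plain-text loader in `datasets` is registered under the name "text", not "txt".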


@@ -558,9 +558,10 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     datasets = load_dataset(


@@ -449,9 +449,10 @@ def main():
     dataset_args = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     dataset_args["keep_linebreaks"] = data_args.keep_linebreaks


@@ -485,9 +485,10 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     datasets = load_dataset(


@@ -599,9 +599,10 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     datasets = load_dataset(


@@ -345,9 +345,10 @@ def main():
     dataset_args = {}
     if args.train_file is not None:
         data_files["train"] = args.train_file
+        extension = args.train_file.split(".")[-1]
     if args.validation_file is not None:
         data_files["validation"] = args.validation_file
-    extension = args.train_file.split(".")[-1]
+        extension = args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks


@@ -351,9 +351,10 @@ def main():
     data_files = {}
     if args.train_file is not None:
         data_files["train"] = args.train_file
+        extension = args.train_file.split(".")[-1]
     if args.validation_file is not None:
         data_files["validation"] = args.validation_file
-    extension = args.train_file.split(".")[-1]
+        extension = args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     raw_datasets = load_dataset(extension, data_files=data_files)


@@ -328,9 +328,10 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)


@@ -311,9 +311,10 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     raw_datasets = load_dataset(
         extension,
         data_files=data_files,


@@ -357,9 +357,10 @@ def main():
     data_files = {}
     if args.train_file is not None:
         data_files["train"] = args.train_file
+        extension = args.train_file.split(".")[-1]
     if args.validation_file is not None:
         data_files["validation"] = args.validation_file
-    extension = args.train_file.split(".")[-1]
+        extension = args.validation_file.split(".")[-1]
     raw_datasets = load_dataset(extension, data_files=data_files)
     # Trim a number of training examples
     if args.debug:


@@ -362,11 +362,13 @@ def main():
     data_files = {}
     if args.train_file is not None:
         data_files["train"] = args.train_file
+        extension = args.train_file.split(".")[-1]
     if args.validation_file is not None:
         data_files["validation"] = args.validation_file
+        extension = args.validation_file.split(".")[-1]
     if args.test_file is not None:
         data_files["test"] = args.test_file
-    extension = args.train_file.split(".")[-1]
+        extension = args.test_file.split(".")[-1]
     raw_datasets = load_dataset(extension, data_files=data_files, field="data")
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.
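
In three-split hunks like the one above, the per-branch assignment also changes which file determines the loader: whichever branch runs last wins, so a test file takes precedence over validation and train. A hypothetical illustration (file names invented):

# Hypothetical: all three splits supplied, with mixed formats.
train_file, validation_file, test_file = "train.csv", "dev.csv", "eval.json"
if train_file is not None:
    extension = train_file.split(".")[-1]       # "csv"
if validation_file is not None:
    extension = validation_file.split(".")[-1]  # "csv"
if test_file is not None:
    extension = test_file.split(".")[-1]        # "json" (this one wins)
print(extension)  # -> "json"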


@@ -410,11 +410,13 @@ def main():
     data_files = {}
     if args.train_file is not None:
         data_files["train"] = args.train_file
+        extension = args.train_file.split(".")[-1]
     if args.validation_file is not None:
         data_files["validation"] = args.validation_file
+        extension = args.validation_file.split(".")[-1]
     if args.test_file is not None:
         data_files["test"] = args.test_file
-    extension = args.train_file.split(".")[-1]
+        extension = args.test_file.split(".")[-1]
     raw_datasets = load_dataset(extension, data_files=data_files, field="data")
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.


@@ -404,9 +404,10 @@ def main():
     data_files = {}
     if args.train_file is not None:
         data_files["train"] = args.train_file
+        extension = args.train_file.split(".")[-1]
     if args.validation_file is not None:
         data_files["validation"] = args.validation_file
-    extension = args.train_file.split(".")[-1]
+        extension = args.validation_file.split(".")[-1]
     raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.


@@ -311,11 +311,13 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
+        extension = data_args.validation_file.split(".")[-1]
     if data_args.test_file is not None:
         data_files["test"] = data_args.test_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.test_file.split(".")[-1]
     raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.


@@ -339,9 +339,10 @@ def main():
     data_files = {}
     if args.train_file is not None:
         data_files["train"] = args.train_file
+        extension = args.train_file.split(".")[-1]
     if args.validation_file is not None:
         data_files["validation"] = args.validation_file
-    extension = args.train_file.split(".")[-1]
+        extension = args.validation_file.split(".")[-1]
     raw_datasets = load_dataset(extension, data_files=data_files)
     # Trim a number of training examples
     if args.debug:


@@ -384,9 +384,10 @@ def main():
     data_files = {}
     if args.train_file is not None:
         data_files["train"] = args.train_file
+        extension = args.train_file.split(".")[-1]
     if args.validation_file is not None:
         data_files["validation"] = args.validation_file
-    extension = args.train_file.split(".")[-1]
+        extension = args.validation_file.split(".")[-1]
     raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.


@@ -297,9 +297,10 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)


@@ -285,9 +285,10 @@ def main():
     data_files = {}
     if args.train_file is not None:
         data_files["train"] = args.train_file
+        extension = args.train_file.split(".")[-1]
     if args.validation_file is not None:
         data_files["validation"] = args.validation_file
-    extension = args.train_file.split(".")[-1]
+        extension = args.validation_file.split(".")[-1]
     raw_datasets = load_dataset(extension, data_files=data_files)
     # Trim a number of training examples
     if args.debug:


@@ -271,9 +271,10 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     datasets = load_dataset(extension, data_files=data_files)


@@ -517,9 +517,10 @@ if __name__ == "__main__":
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     datasets = load_dataset(extension, data_files=data_files)


@@ -341,9 +341,10 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     if extension == "txt":
         extension = "text"
     raw_datasets = load_dataset(


@@ -320,9 +320,10 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     raw_datasets = load_dataset(
         extension,
         data_files=data_files,


@@ -260,9 +260,10 @@ def main():
     data_files = {}
     if data_args.train_file is not None:
         data_files["train"] = data_args.train_file
+        extension = data_args.train_file.split(".")[-1]
     if data_args.validation_file is not None:
         data_files["validation"] = data_args.validation_file
-    extension = data_args.train_file.split(".")[-1]
+        extension = data_args.validation_file.split(".")[-1]
     raw_datasets = load_dataset(
         extension,
         data_files=data_files,


@@ -730,9 +730,10 @@ def main():
     data_files = {}
     if args.train_file is not None:
         data_files["train"] = args.train_file
+        extension = args.train_file.split(".")[-1]
     if args.validation_file is not None:
         data_files["validation"] = args.validation_file
-    extension = args.train_file.split(".")[-1]
+        extension = args.validation_file.split(".")[-1]
     raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.