diff --git a/examples/flax/_tests_requirements.txt b/examples/flax/_tests_requirements.txt
index 543d140e652..2e93a1f2c54 100644
--- a/examples/flax/_tests_requirements.txt
+++ b/examples/flax/_tests_requirements.txt
@@ -1,4 +1,4 @@
-datasets >= 1.13.3,<2.20.0 # Temporary upper version
+datasets >= 1.13.3
 pytest<8.0.1
 conllu
 nltk
diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py
index f30274215ca..879372a7523 100644
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -195,9 +195,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -458,6 +458,7 @@ def main():
             keep_in_memory=False,
             data_dir=data_args.data_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
diff --git a/examples/flax/language-modeling/run_bart_dlm_flax.py b/examples/flax/language-modeling/run_bart_dlm_flax.py
index 53a8da676e0..6bf98d25fc5 100644
--- a/examples/flax/language-modeling/run_bart_dlm_flax.py
+++ b/examples/flax/language-modeling/run_bart_dlm_flax.py
@@ -191,6 +191,16 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
     validation_file: Optional[str] = field(
         default=None,
@@ -518,6 +528,7 @@ def main():
             cache_dir=model_args.cache_dir,
             token=model_args.token,
             num_proc=data_args.preprocessing_num_workers,
+            trust_remote_code=data_args.trust_remote_code,
         )

         if "validation" not in datasets.keys():
@@ -528,6 +539,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 num_proc=data_args.preprocessing_num_workers,
+                trust_remote_code=data_args.trust_remote_code,
             )
             datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -536,6 +548,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 num_proc=data_args.preprocessing_num_workers,
+                trust_remote_code=data_args.trust_remote_code,
             )
     else:
         data_files = {}
diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py
index 5f40b6254b1..a12bc6d3c8d 100755
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -182,9 +182,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -408,6 +408,7 @@ def main(): keep_in_memory=False, token=model_args.token, num_proc=data_args.preprocessing_num_workers, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in dataset.keys(): @@ -418,6 +419,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, num_proc=data_args.preprocessing_num_workers, + trust_remote_code=model_args.trust_remote_code, ) dataset["train"] = load_dataset( data_args.dataset_name, @@ -426,6 +428,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, num_proc=data_args.preprocessing_num_workers, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py index a13c62e0fdf..4d837e9c113 100755 --- a/examples/flax/language-modeling/run_mlm_flax.py +++ b/examples/flax/language-modeling/run_mlm_flax.py @@ -188,9 +188,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -446,6 +446,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, num_proc=data_args.preprocessing_num_workers, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in datasets.keys(): @@ -456,6 +457,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, num_proc=data_args.preprocessing_num_workers, + trust_remote_code=model_args.trust_remote_code, ) datasets["train"] = load_dataset( data_args.dataset_name, @@ -464,6 +466,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, num_proc=data_args.preprocessing_num_workers, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py index c4b47711d99..c133824fcc2 100755 --- a/examples/flax/language-modeling/run_t5_mlm_flax.py +++ b/examples/flax/language-modeling/run_t5_mlm_flax.py @@ -192,6 +192,16 @@ class DataTrainingArguments: dataset_config_name: Optional[str] = field( default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
+ ) + }, + ) train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) validation_file: Optional[str] = field( default=None, @@ -560,6 +570,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, num_proc=data_args.preprocessing_num_workers, + trust_remote_code=data_args.trust_remote_code, ) if "validation" not in datasets.keys(): @@ -570,6 +581,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, num_proc=data_args.preprocessing_num_workers, + trust_remote_code=data_args.trust_remote_code, ) datasets["train"] = load_dataset( data_args.dataset_name, @@ -578,6 +590,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, num_proc=data_args.preprocessing_num_workers, + trust_remote_code=data_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 16a744ddc32..f80cd8a0341 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -168,9 +168,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -498,6 +498,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: # Loading the dataset from local csv or json file. diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 15df6cb5818..d911797cb9f 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -136,6 +136,16 @@ class DataTrainingArguments: dataset_config_name: Optional[str] = field( default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
+ ) + }, + ) text_column: Optional[str] = field( default=None, metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."}, @@ -442,6 +452,7 @@ def main(): cache_dir=data_args.dataset_cache_dir, num_proc=data_args.preprocessing_num_workers, token=True if model_args.use_auth_token else None, + trust_remote_code=data_args.trust_remote_code, ) if training_args.do_eval: @@ -452,6 +463,7 @@ def main(): cache_dir=data_args.dataset_cache_dir, num_proc=data_args.preprocessing_num_workers, token=True if model_args.use_auth_token else None, + trust_remote_code=data_args.trust_remote_code, ) if not training_args.do_train and not training_args.do_eval: diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py index bead750720e..36407df3b41 100644 --- a/examples/flax/summarization/run_summarization_flax.py +++ b/examples/flax/summarization/run_summarization_flax.py @@ -201,9 +201,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -485,6 +485,7 @@ def main(): cache_dir=model_args.cache_dir, keep_in_memory=False, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/flax/test_flax_examples.py b/examples/flax/test_flax_examples.py index 9fc424c1a75..c81d6378185 100644 --- a/examples/flax/test_flax_examples.py +++ b/examples/flax/test_flax_examples.py @@ -265,6 +265,7 @@ class ExamplesTests(TestCasePlus): --dataset_config clean --train_split_name validation --eval_split_name validation + --trust_remote_code --output_dir {tmp_dir} --overwrite_output_dir --num_train_epochs=2 diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index ecb52ceb086..51f66777cd8 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -170,9 +170,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -449,6 +449,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: # Loading the dataset from local csv or json file. 
diff --git a/examples/pytorch/_tests_requirements.txt b/examples/pytorch/_tests_requirements.txt
index 21096fc96a1..819b49c799a 100644
--- a/examples/pytorch/_tests_requirements.txt
+++ b/examples/pytorch/_tests_requirements.txt
@@ -13,7 +13,7 @@ streamlit
 elasticsearch
 nltk
 pandas
-datasets >= 1.13.3,<2.20.0 # Temporary upper version
+datasets >= 1.13.3
 fire
 pytest<8.0.1
 conllu
diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py
index 70a3c77c200..269199a5b70 100644
--- a/examples/pytorch/audio-classification/run_audio_classification.py
+++ b/examples/pytorch/audio-classification/run_audio_classification.py
@@ -165,9 +165,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -261,12 +261,14 @@ def main():
         data_args.dataset_config_name,
         split=data_args.train_split_name,
         token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
     raw_datasets["eval"] = load_dataset(
         data_args.dataset_name,
         data_args.dataset_config_name,
         split=data_args.eval_split_name,
         token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )

     if data_args.audio_column_name not in raw_datasets["train"].column_names:
diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py
index c99bfab9cf2..726ce3c4421 100644
--- a/examples/pytorch/contrastive-image-text/run_clip.py
+++ b/examples/pytorch/contrastive-image-text/run_clip.py
@@ -99,9 +99,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -305,6 +305,7 @@ def main():
             keep_in_memory=False,
             data_dir=data_args.data_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py
index ea9f3096b0b..63c1a0a7600 100755
--- a/examples/pytorch/image-classification/run_image_classification.py
+++ b/examples/pytorch/image-classification/run_image_classification.py
@@ -164,9 +164,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -242,6 +242,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index b8f69b4b6f3..2d0dd070e86 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -150,12 +150,11 @@ def parse_args(): parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") parser.add_argument( "--trust_remote_code", - type=bool, - default=False, + action="store_true", help=( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ), ) parser.add_argument( @@ -284,7 +283,7 @@ def main(): # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. - dataset = load_dataset(args.dataset_name) + dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code) else: data_files = {} if args.train_dir is not None: diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index a200fc87887..dd94c3e5104 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -63,6 +63,16 @@ class DataTrainingArguments: dataset_config_name: Optional[str] = field( default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ) + }, + ) image_column_name: Optional[str] = field( default=None, metadata={"help": "The column name of the images in the files."} ) @@ -225,6 +235,7 @@ def main(): data_files=data_args.data_files, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=data_args.trust_remote_code, ) # If we don't have a validation split, split off a percentage of train as validation. 
diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py
index 5df8bfdcfed..ce90aeb75c0 100644
--- a/examples/pytorch/image-pretraining/run_mim.py
+++ b/examples/pytorch/image-pretraining/run_mim.py
@@ -166,9 +166,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -299,6 +299,7 @@ def main():
         data_files=data_args.data_files,
         cache_dir=model_args.cache_dir,
         token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )

     # If we don't have a validation split, split off a percentage of train as validation.
diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
index c77b8077d87..0008f2bd7fc 100644
--- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py
+++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
@@ -197,12 +197,11 @@ def parse_args():
     )
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -441,6 +440,7 @@ def main():
         data_files=args.data_files,
         cache_dir=args.cache_dir,
         token=args.token,
+        trust_remote_code=args.trust_remote_code,
     )

     # If we don't have a validation split, split off a percentage of train as validation.
diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py
index 3a5d08b2505..e8d7ee04891 100644
--- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py
@@ -68,6 +68,16 @@ class Arguments:
             "help": "Name of a dataset from the hub (could be your own, possibly private dataset hosted on the hub)."
         },
     )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+ ) + }, + ) image_height: Optional[int] = field(default=512, metadata={"help": "Image height after resizing."}) image_width: Optional[int] = field(default=512, metadata={"help": "Image width after resizing."}) token: str = field( @@ -364,7 +374,7 @@ def main(): # Load dataset, prepare splits # ------------------------------------------------------------------------------------------------ - dataset = load_dataset(args.dataset_name) + dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code) # We need to specify the label2id mapping for the model # it is a mapping from semantic class name to class index. diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index f9b96389166..7c0eb31068b 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -71,6 +71,15 @@ def parse_args(): help="Name of the dataset on the hub.", default="qubvel-hf/ade20k-mini", ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help=( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ), + ) parser.add_argument( "--image_height", type=int, @@ -425,7 +434,7 @@ def main(): # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. - dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir) + dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code) # We need to specify the label2id mapping for the model # it is a mapping from semantic class name to class index. diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 9c26f32bdd4..c0db5703722 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -124,9 +124,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
             )
         },
     )
@@ -312,6 +312,7 @@ def main():
             cache_dir=model_args.cache_dir,
             token=model_args.token,
             streaming=data_args.streaming,
+            trust_remote_code=model_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -321,6 +322,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -329,6 +331,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
     else:
         data_files = {}
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 5ae2943ebb8..f93935f406e 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -195,12 +195,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -327,17 +326,21 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[:{args.validation_split_percentage}%]",
+                trust_remote_code=args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[{args.validation_split_percentage}%:]",
+                trust_remote_code=args.trust_remote_code,
             )
     else:
         data_files = {}
diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py
index ac4154e3198..fa7ebbfd747 100644
--- a/examples/pytorch/language-modeling/run_fim.py
+++ b/examples/pytorch/language-modeling/run_fim.py
@@ -127,9 +127,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -382,6 +382,7 @@ def main():
             cache_dir=model_args.cache_dir,
             token=model_args.token,
             streaming=data_args.streaming,
+            trust_remote_code=model_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -391,6 +392,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -399,6 +401,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
     else:
         data_files = {}
diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py
index fd62c647a7c..23de80fc829 100644
--- a/examples/pytorch/language-modeling/run_fim_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py
@@ -257,12 +257,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
        help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -395,17 +394,21 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[:{args.validation_split_percentage}%]",
+                trust_remote_code=args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[{args.validation_split_percentage}%:]",
+                trust_remote_code=args.trust_remote_code,
             )
     else:
         data_files = {}
diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py
index e4f005a562f..3485b9ca1b0 100755
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -121,9 +121,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -324,6 +324,7 @@ def main():
             cache_dir=model_args.cache_dir,
             token=model_args.token,
             streaming=data_args.streaming,
+            trust_remote_code=model_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -333,6 +334,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -341,6 +343,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
     else:
         data_files = {}
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 89cbeb74c05..42cb0008e7b 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -202,12 +202,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -334,17 +333,21 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[:{args.validation_split_percentage}%]",
+                trust_remote_code=args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[{args.validation_split_percentage}%:]",
+                trust_remote_code=args.trust_remote_code,
             )
     else:
         data_files = {}
diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py
index c16469afc76..b51f1acbf37 100755
--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@@ -133,6 +133,16 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+ ) + }, + ) train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) validation_file: Optional[str] = field( default=None, @@ -292,6 +302,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=data_args.trust_remote_code, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -300,6 +311,7 @@ def main(): split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=data_args.trust_remote_code, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, @@ -307,6 +319,7 @@ def main(): split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=data_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index f2c98d159dc..bf293bb190c 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -184,12 +184,11 @@ def parse_args(): parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") parser.add_argument( "--trust_remote_code", - type=bool, - default=False, + action="store_true", help=( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ), ) parser.add_argument( @@ -351,7 +350,9 @@ def main(): # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + raw_datasets = load_dataset( + args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code + ) else: data_files = {} if args.train_file is not None: diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 62e60acc723..e82d9029139 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -313,9 +313,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
             )
         },
     )
@@ -383,7 +383,9 @@ def main():
     # Load dataset, prepare splits
     # ------------------------------------------------------------------------------------------------

-    dataset = load_dataset(data_args.dataset_name, cache_dir=model_args.cache_dir)
+    dataset = load_dataset(
+        data_args.dataset_name, cache_dir=model_args.cache_dir, trust_remote_code=model_args.trust_remote_code
+    )

     # If we don't have a validation split, split off a percentage of train as validation
     data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split
diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
index 8bea58aa506..f79c9ddf9bb 100644
--- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py
+++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
@@ -340,12 +340,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -445,7 +444,7 @@ def main():
     # Load dataset
     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
     # download the dataset.
-    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
+    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)

     # If we don't have a validation split, split off a percentage of train as validation.
     args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split
diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py
index 5b588c2fe85..6e5ddbf0810 100755
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -93,9 +93,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -301,6 +301,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py
index bdf9c44dfc7..e528a1bc53b 100755
--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@@ -101,6 +101,16 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
     validation_file: Optional[str] = field(
         default=None,
@@ -289,6 +299,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )
     else:
         data_files = {}
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index 0d37cda0b9b..b05f0c6f503 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -100,6 +100,15 @@ def parse_args():
         default=None,
         help="The configuration name of the dataset to use (via the datasets library).",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
     parser.add_argument(
         "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
     )
@@ -356,7 +365,9 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
     else:
         data_files = {}
         if args.train_file is not None:
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 5aeeff2440d..c0f98ce2331 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -275,12 +275,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ), ) parser.add_argument( @@ -404,7 +403,9 @@ def main(): # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + raw_datasets = load_dataset( + args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code + ) else: data_files = {} if args.train_file is not None: diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 421341c21dc..fffd643a650 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -93,9 +93,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -346,6 +346,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 99ef3a871ee..6aa66b9c48a 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -165,9 +165,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -233,7 +233,9 @@ def main(): # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. 
     # TODO support datasets from local folders
-    dataset = load_dataset(data_args.dataset_name, cache_dir=model_args.cache_dir)
+    dataset = load_dataset(
+        data_args.dataset_name, cache_dir=model_args.cache_dir, trust_remote_code=model_args.trust_remote_code
+    )

     # Rename column names to standardized names (only "image" and "label" need to be present)
     if "pixel_values" in dataset["train"].column_names:
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
index 35af08ecfcf..4aa26f9aab1 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -180,12 +180,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -294,7 +293,7 @@ def main():
     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
     # download the dataset.
     # TODO support datasets from local folders
-    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
+    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)

     # Rename column names to standardized names (only "image" and "label" need to be present)
     if "pixel_values" in dataset["train"].column_names:
diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
index 9592a1f6e4d..62b15c0f313 100755
--- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
@@ -71,6 +71,15 @@ def parse_args():
         required=True,
         help="The names of the training data set splits to use (via the datasets library).",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
     parser.add_argument(
         "--preprocessing_num_workers",
         type=int,
@@ -446,6 +455,7 @@ def main():
             dataset_config_name,
             split=train_split_name,
             cache_dir=args.cache_dir,
+            trust_remote_code=args.trust_remote_code,
         )
         datasets_splits.append(dataset_split)

diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
index 6a05f342b3f..ac92612ea8f 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -255,9 +255,9 @@ class DataTrainingArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -454,6 +454,7 @@ def main():
             data_args.dataset_config_name,
             split=data_args.train_split_name,
             token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )

         if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -479,6 +480,7 @@ def main():
             data_args.dataset_config_name,
             split=data_args.eval_split_name,
             token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )

         if data_args.max_eval_samples is not None:
diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
index bf3241c61da..9a93910739e 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
@@ -245,9 +245,9 @@ class DataTrainingArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -434,6 +434,7 @@ def main():
             data_args.dataset_config_name,
             split=data_args.train_split_name,
             token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )

         if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -459,6 +460,7 @@ def main():
             data_args.dataset_config_name,
             split=data_args.eval_split_name,
             token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )

         if data_args.max_eval_samples is not None:
diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
index f750d74f6c3..666bf075f29 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -98,9 +98,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -347,6 +347,7 @@ def main():
             split=data_args.train_split_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )

     if training_args.do_eval:
@@ -356,6 +357,7 @@ def main():
             split=data_args.eval_split_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )

     if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index a27fa2e5b38..7b9a126bda9 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -112,9 +112,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
) }, ) @@ -397,6 +397,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 2f9c4299e6e..2af52b935a6 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -268,12 +268,11 @@ def parse_args(): parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") parser.add_argument( "--trust_remote_code", - type=bool, - default=False, + action="store_true", help=( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ), ) parser.add_argument( @@ -398,7 +397,9 @@ def main(): # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + raw_datasets = load_dataset( + args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code + ) else: data_files = {} if args.train_file is not None: diff --git a/examples/pytorch/test_accelerate_examples.py b/examples/pytorch/test_accelerate_examples.py index f3695b4ad11..fe700eabdd9 100644 --- a/examples/pytorch/test_accelerate_examples.py +++ b/examples/pytorch/test_accelerate_examples.py @@ -313,6 +313,7 @@ class ExamplesTestsNoTrainer(TestCasePlus): {self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py --model_name_or_path google/vit-base-patch16-224-in21k --dataset_name hf-internal-testing/cats_vs_dogs_sample + --trust_remote_code --learning_rate 1e-4 --per_device_train_batch_size 2 --per_device_eval_batch_size 1 diff --git a/examples/pytorch/test_pytorch_examples.py b/examples/pytorch/test_pytorch_examples.py index dab2148a728..c609ee860c7 100644 --- a/examples/pytorch/test_pytorch_examples.py +++ b/examples/pytorch/test_pytorch_examples.py @@ -391,6 +391,7 @@ class ExamplesTests(TestCasePlus): --output_dir {tmp_dir} --model_name_or_path google/vit-base-patch16-224-in21k --dataset_name hf-internal-testing/cats_vs_dogs_sample + --trust_remote_code --do_train --do_eval --learning_rate 1e-4 @@ -424,6 +425,7 @@ class ExamplesTests(TestCasePlus): --dataset_config_name clean --train_split_name validation --eval_split_name validation + --trust_remote_code --do_train --do_eval --learning_rate 1e-4 @@ -454,6 +456,7 @@ class ExamplesTests(TestCasePlus): --dataset_config_name clean --train_split_name validation --eval_split_name validation + --trust_remote_code --do_train --do_eval --learning_rate 1e-4 @@ -486,6 +489,7 @@ class ExamplesTests(TestCasePlus): --dataset_config_name clean --train_split_name validation --eval_split_name validation + --trust_remote_code --do_train --do_eval --learning_rate 1e-4 @@ -513,6 +517,7 @@ class ExamplesTests(TestCasePlus): --output_dir {tmp_dir} --model_name_or_path 
hf-internal-testing/tiny-random-wav2vec2 --dataset_name anton-l/superb_demo + --trust_remote_code --dataset_config_name ks --train_split_name test --eval_split_name test @@ -547,6 +552,7 @@ class ExamplesTests(TestCasePlus): --dataset_name hf-internal-testing/librispeech_asr_dummy --dataset_config_names clean --dataset_split_names validation + --trust_remote_code --learning_rate 1e-4 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 @@ -567,6 +573,7 @@ class ExamplesTests(TestCasePlus): run_mae.py --output_dir {tmp_dir} --dataset_name hf-internal-testing/cats_vs_dogs_sample + --trust_remote_code --do_train --do_eval --learning_rate 1e-4 diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index b5da2063b65..84bd60938f1 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -240,9 +240,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -338,6 +338,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # Try print some info about the dataset logger.info(f"Dataset loaded: {raw_datasets}") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index c9d4ec8b10c..b566c452151 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -201,9 +201,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -300,6 +300,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: # Loading a dataset from your local files. diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 05bdd01ef81..c9bb86588fd 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -92,9 +92,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. 
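Note: the test hunks above splice `--trust_remote_code` into a whitespace-joined argv string. Roughly how those tests drive the example scripts, with a toy `main()` standing in for the real entry point (the actual tests import each script's `main`):

    import sys
    from unittest.mock import patch


    def main():
        # Stand-in: the real tests call the example script's main() here.
        print(sys.argv[1:])


    testargs = """
        run_image_classification.py
        --dataset_name hf-internal-testing/cats_vs_dogs_sample
        --trust_remote_code
        """.split()

    with patch.object(sys, "argv", testargs):
        main()  # sees the flag exactly as argparse will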
This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -290,6 +290,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index dd91659433c..b7d3e3df007 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -212,12 +212,11 @@ def parse_args(): parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") parser.add_argument( "--trust_remote_code", - type=bool, - default=False, + action="store_true", help=( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ), ) parser.add_argument( @@ -333,7 +332,9 @@ def main(): # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + raw_datasets = load_dataset( + args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code + ) else: data_files = {} if args.train_file is not None: diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index 6e5a06b310b..35a2ab0ef23 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -102,9 +102,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -346,6 +346,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index e88102b2538..f30d12a77e8 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -76,7 +76,6 @@ def parse_args(): default=None, help="The name of the dataset to use (via the datasets library).", ) - parser.add_argument( "--predict_with_generate", type=bool, @@ -259,12 +258,11 @@ def parse_args(): parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") parser.add_argument( "--trust_remote_code", - type=bool, - default=False, + action="store_true", help=( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ), ) parser.add_argument( @@ -378,7 +376,9 @@ def main(): # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + raw_datasets = load_dataset( + args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code + ) else: data_files = {} if args.train_file is not None: diff --git a/examples/tensorflow/_tests_requirements.txt b/examples/tensorflow/_tests_requirements.txt index dd02b13c3d8..6971795ce4e 100644 --- a/examples/tensorflow/_tests_requirements.txt +++ b/examples/tensorflow/_tests_requirements.txt @@ -14,7 +14,7 @@ streamlit elasticsearch nltk pandas -datasets >= 1.13.3,<2.20.0 # Temporary upper version +datasets >= 1.13.3 fire pytest<8.0.1 conllu diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 839dc962b92..786e9800007 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -105,9 +105,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
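Note: the switch from `type=bool` to `action="store_true"` in the `_no_trainer` scripts above fixes a real argparse pitfall: `type=bool` applies `bool()` to the raw string, and any non-empty string is truthy, so `--trust_remote_code False` would still enable the flag. A self-contained demonstration:

    import argparse

    parser = argparse.ArgumentParser()
    # Old, buggy form: bool("False") is True, so the flag could not be
    # reliably disabled from the command line.
    # parser.add_argument("--trust_remote_code", type=bool, default=False)

    # New form: a plain presence flag, False unless passed.
    parser.add_argument("--trust_remote_code", action="store_true")

    print(parser.parse_args([]).trust_remote_code)                       # False
    print(parser.parse_args(["--trust_remote_code"]).trust_remote_code)  # True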
) }, ) @@ -326,6 +326,7 @@ def main(): keep_in_memory=False, data_dir=data_args.data_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index e5f8c2edb7a..1cdb6ef2950 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -171,9 +171,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -284,6 +284,7 @@ def main(): cache_dir=model_args.cache_dir, task="image-classification", token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py b/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py index a8bb7d37929..260f77226b1 100644 --- a/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py +++ b/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py @@ -42,6 +42,15 @@ def parse_args(): parser.add_argument( "--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset." ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help=( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ), + ) parser.add_argument( "--tokenizer_name_or_path", type=str, @@ -105,7 +114,9 @@ def get_serialized_examples(tokenized_data): def main(args): - dataset = datasets.load_dataset(args.dataset_name, args.dataset_config, split=args.split) + dataset = datasets.load_dataset( + args.dataset_name, args.dataset_config, split=args.split, trust_remote_code=args.trust_remote_code + ) if args.limit is not None: max_samples = min(len(dataset), args.limit) diff --git a/examples/tensorflow/language-modeling-tpu/train_unigram.py b/examples/tensorflow/language-modeling-tpu/train_unigram.py index a71cac45759..615f93bc1bf 100644 --- a/examples/tensorflow/language-modeling-tpu/train_unigram.py +++ b/examples/tensorflow/language-modeling-tpu/train_unigram.py @@ -41,6 +41,15 @@ def parse_args(): parser.add_argument( "--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset." ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help=( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
+ ), + ) parser.add_argument( "--batch_size", type=int, @@ -69,7 +78,9 @@ def parse_args(): def main(args): - dataset = datasets.load_dataset(args.dataset_name, args.dataset_config, split="train") + dataset = datasets.load_dataset( + args.dataset_name, args.dataset_config, split="train", trust_remote_code=args.trust_remote_code + ) if args.limit is not None: max_train_samples = min(len(dataset), args.limit) diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py index a75cf9bf1d3..00cfa6f7d24 100755 --- a/examples/tensorflow/language-modeling/run_clm.py +++ b/examples/tensorflow/language-modeling/run_clm.py @@ -125,9 +125,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -298,6 +298,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -306,6 +307,7 @@ def main(): split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, @@ -313,6 +315,7 @@ def main(): split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py index 43b991e7fe2..9e1cded9a31 100755 --- a/examples/tensorflow/language-modeling/run_mlm.py +++ b/examples/tensorflow/language-modeling/run_mlm.py @@ -123,9 +123,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
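Note: the `run_clm.py` hunks above thread the flag through all three `load_dataset` calls, including the two that carve a validation set out of `train` via percent slicing. The slicing syntax in isolation, with an illustrative ungated dataset:

    from datasets import load_dataset

    validation_split_percentage = 5
    # First 5% of the train split becomes validation...
    val = load_dataset(
        "wikitext", "wikitext-2-raw-v1", split=f"train[:{validation_split_percentage}%]"
    )
    # ...and the remaining 95% stays as train.
    train = load_dataset(
        "wikitext", "wikitext-2-raw-v1", split=f"train[{validation_split_percentage}%:]"
    )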
) }, ) @@ -307,6 +307,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -314,12 +315,14 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index f751fa4b430..821c8529e54 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -104,9 +104,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -329,6 +329,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index a76c1897045..8aaa033c1bc 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -112,9 +112,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -366,6 +366,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/tensorflow/test_tensorflow_examples.py b/examples/tensorflow/test_tensorflow_examples.py index 914ea767d0f..bbb8bfa3891 100644 --- a/examples/tensorflow/test_tensorflow_examples.py +++ b/examples/tensorflow/test_tensorflow_examples.py @@ -316,6 +316,7 @@ class ExamplesTests(TestCasePlus): testargs = f""" run_image_classification.py --dataset_name hf-internal-testing/cats_vs_dogs_sample + --trust_remote_code --model_name_or_path microsoft/resnet-18 --do_train --do_eval diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py index 54a6e7b8855..19d153108b1 100644 --- a/examples/tensorflow/token-classification/run_ner.py +++ b/examples/tensorflow/token-classification/run_ner.py @@ -88,9 +88,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -239,6 +239,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 90f7fe01f71..f183657e49a 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -106,9 +106,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -333,6 +333,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/scripts/check_tokenizers.py b/scripts/check_tokenizers.py index ea0d0bc2185..6d6773b00e8 100644 --- a/scripts/check_tokenizers.py +++ b/scripts/check_tokenizers.py @@ -13,7 +13,7 @@ TOKENIZER_CLASSES = { name: (getattr(transformers, name), getattr(transformers, name + "Fast")) for name in SLOW_TO_FAST_CONVERTERS } -dataset = datasets.load_dataset("xnli", split="test+validation") +dataset = datasets.load_dataset("facebook/xnli", split="test+validation") # no-script total = 0 perfect = 0 diff --git a/setup.py b/setup.py index 74d6e64af74..4edffc724e9 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,7 @@ _deps = [ "codecarbon==1.2.0", "cookiecutter==1.7.3", "dataclasses", - "datasets!=2.5.0,<2.20.0", # Temporary upper version + "datasets!=2.5.0", "decord==0.6.0", "deepspeed>=0.9.3", "diffusers", diff --git a/src/transformers/agents/text_to_speech.py b/src/transformers/agents/text_to_speech.py index 4e8500bcab6..3166fab8023 100644 --- a/src/transformers/agents/text_to_speech.py +++ b/src/transformers/agents/text_to_speech.py @@ -51,7 +51,9 @@ class TextToSpeechTool(PipelineTool): if not is_datasets_available(): raise ImportError("Datasets needs to be installed if not passing speaker embeddings.") - embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") + embeddings_dataset = load_dataset( + "Matthijs/cmu-arctic-xvectors", split="validation", trust_remote_code=True + ) speaker_embeddings = torch.tensor(embeddings_dataset[7305]["xvector"]).unsqueeze(0) return {"input_ids": inputs["input_ids"], "speaker_embeddings": speaker_embeddings} diff --git a/src/transformers/commands/pt_to_tf.py b/src/transformers/commands/pt_to_tf.py index 85382ac5a4f..4df45f7f086 100644 --- a/src/transformers/commands/pt_to_tf.py +++ b/src/transformers/commands/pt_to_tf.py @@ -202,7 +202,9 @@ class PTtoTFCommand(BaseTransformersCLICommand): """ def _get_audio_input(): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) speech_samples = ds.sort("id").select(range(2))[:2]["audio"] raw_samples = [x["array"] for x in speech_samples] return raw_samples @@ -234,7 +236,7 @@ class PTtoTFCommand(BaseTransformersCLICommand): } ) if "pixel_values" in model_forward_signature: - sample_images = load_dataset("cifar10", "plain_text", split="test")[:2]["img"] + sample_images = load_dataset("uoft-cs/cifar10", "plain_text", split="test")[:2]["img"] # no-script processor_inputs.update({"images": sample_images}) if "input_features" in model_forward_signature: feature_extractor_signature = inspect.signature(processor.feature_extractor).parameters diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 4a8f47fef9b..3148d0f3393 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -9,7 +9,7 @@ deps = { "codecarbon": "codecarbon==1.2.0", "cookiecutter": "cookiecutter==1.7.3", "dataclasses": "dataclasses", - "datasets": "datasets!=2.5.0,<2.20.0", + "datasets": "datasets!=2.5.0", "decord": "decord==0.6.0", "deepspeed": "deepspeed>=0.9.3", "diffusers": "diffusers", diff --git a/src/transformers/generation/logits_process.py 
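Note: alongside lifting the `datasets<2.20.0` pin, the hunks above migrate canonical dataset ids to their namespaced `owner/name` form (`xnli` -> `facebook/xnli`, `cifar10` -> `uoft-cs/cifar10`); the trailing `# no-script` comments appear to tag datasets that no longer rely on a loading script. Both forms may still resolve, but the explicit form is what the diff standardizes on. A small example; the `"en"` config is chosen for illustration:

    from datasets import load_dataset

    # Namespaced repo id, as used in scripts/check_tokenizers.py above.
    ds = load_dataset("facebook/xnli", "en", split="validation")
    print(ds[0]["premise"])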
b/src/transformers/generation/logits_process.py index b226a059d10..c9a978f5ee8 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1760,7 +1760,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> # Whisper has `begin_suppress_tokens` set by default (= `[220, 50256]`). 50256 is the EOS token, so this means @@ -1812,7 +1812,7 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> # Whisper has a long list of suppressed tokens. For instance, in this case, the token 1 is suppressed by default. @@ -1901,7 +1901,7 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) >>> inputs = processor(ds[3]["audio"]["array"], return_tensors="pt") >>> input_features = inputs.input_features diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index 2f75d07592f..d211ef7ab05 100644 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -205,7 +205,8 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length) if "speech-commands" in model_name: - dataset = load_dataset("speech_commands", "v0.02", split="validation") + # TODO: Convert dataset to Parquet + dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True) waveform = dataset[0]["audio"]["array"] else: filepath = hf_hub_download( diff --git a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py index c2e366d7dd0..46c72a97f49 100644 --- a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py +++ b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py @@ -266,7 +266,7 @@ def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path): # Check outputs on an image if is_semantic: image_processor = 
BeitImageProcessor(size=config.image_size, do_center_crop=False) - ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") + ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) image = Image.open(ds[0]["file"]) else: image_processor = BeitImageProcessor( diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index f73ab9e51f4..1f8d908270d 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -2409,7 +2409,7 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel): >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") >>> model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base") - >>> squad_ds = load_dataset("squad_v2", split="train") # doctest: +IGNORE_RESULT + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT >>> # select random long article >>> LONG_ARTICLE_TARGET = squad_ds[81514]["context"] @@ -2711,7 +2711,7 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel): >>> tokenizer = AutoTokenizer.from_pretrained("l-yohai/bigbird-roberta-base-mnli") >>> model = BigBirdForSequenceClassification.from_pretrained("l-yohai/bigbird-roberta-base-mnli") - >>> squad_ds = load_dataset("squad_v2", split="train") # doctest: +IGNORE_RESULT + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT >>> LONG_ARTICLE = squad_ds[81514]["context"] >>> inputs = tokenizer(LONG_ARTICLE, return_tensors="pt") @@ -3040,7 +3040,7 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel): >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") >>> model = BigBirdForQuestionAnswering.from_pretrained("google/bigbird-roberta-base") - >>> squad_ds = load_dataset("squad_v2", split="train") # doctest: +IGNORE_RESULT + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT >>> # select random article and question >>> LONG_ARTICLE = squad_ds[81514]["context"] diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index 3a70d680573..a673d64614d 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -1681,7 +1681,7 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel): >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library) >>> text = "This is an example text." - >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() @@ -1754,7 +1754,7 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel): >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library) >>> text = "This is an example text." 
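Note: the CLVP docstrings above pair `trust_remote_code=True` with decode-time resampling via `cast_column`. That resampling step in isolation, using the same dummy dataset as the docstrings:

    from datasets import Audio, load_dataset

    ds = load_dataset(
        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
    )
    # Audio is resampled lazily on access, so no re-encoding pass is needed.
    ds = ds.cast_column("audio", Audio(sampling_rate=22050))
    print(ds[0]["audio"]["sampling_rate"])  # 22050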
- >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py index b5a30223bcb..5339f1671b0 100644 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -226,7 +226,7 @@ def convert_wav2vec2_checkpoint( processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60") - ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) input_audio = [x["array"] for x in ds[:4]["audio"]] inputs = processor(input_audio, return_tensors="pt", padding=True) diff --git a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py index 8f1a8370933..4db60e0faeb 100755 --- a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py @@ -831,7 +831,7 @@ class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel): >>> model.config.decoder_start_token_id = tokenizer.bos_token_id >>> # pre-process inputs and labels - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) >>> inputs = feature_extractor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" ... ) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 913bf2b64b6..f6f14f6d08e 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -148,7 +148,7 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ model.load_state_dict(new_state_dict) # verify results on scanned document - dataset = load_dataset("hf-internal-testing/example-documents") + dataset = load_dataset("hf-internal-testing/example-documents") # no-script image = dataset["test"][0]["image"].convert("RGB") tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 4c22ca8f2b6..e66a70e0501 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1431,7 +1431,7 @@ class HubertModel(HubertPreTrainedModel): ... 
return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 2adfeea5b8b..6c2a341927e 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -1471,7 +1471,7 @@ class TFHubertModel(TFHubertPreTrainedModel): ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 @@ -1583,7 +1583,7 @@ class TFHubertForCTC(TFHubertPreTrainedModel): ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 4a761fcc0d6..55e17bfc586 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -1294,7 +1294,7 @@ class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel): >>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True) >>> model = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac") - >>> dataset = load_dataset("nielsr/funsd", split="train") + >>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True) >>> example = dataset[0] >>> question = "what's his name?" >>> words = example["words"] diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index 5e95f3a3b58..59aebe15b5d 100644 --- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -1601,7 +1601,7 @@ class TFLayoutLMForQuestionAnswering(TFLayoutLMPreTrainedModel, TFQuestionAnswer >>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True) >>> model = TFLayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac") - >>> dataset = load_dataset("nielsr/funsd", split="train") + >>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True) >>> example = dataset[0] >>> question = "what's his name?" 
>>> words = example["words"] diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index dd7b249f840..50ef27be3f5 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -838,7 +838,7 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel): >>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased") - >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa") + >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa", trust_remote_code=True) >>> image_path = dataset["test"][0]["file"] >>> image = Image.open(image_path).convert("RGB") @@ -1005,7 +1005,7 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel): >>> set_seed(0) - >>> dataset = load_dataset("rvl_cdip", split="train", streaming=True) + >>> dataset = load_dataset("aharley/rvl_cdip", split="train", streaming=True, trust_remote_code=True) >>> data = next(iter(dataset)) >>> image = data["image"].convert("RGB") @@ -1184,7 +1184,7 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel): >>> set_seed(0) - >>> datasets = load_dataset("nielsr/funsd", split="test") + >>> datasets = load_dataset("nielsr/funsd", split="test", trust_remote_code=True) >>> labels = datasets.features["ner_tags"].feature.names >>> id2label = {v: k for v, k in enumerate(labels)} @@ -1328,7 +1328,7 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel): >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") >>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased") - >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa") + >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa", trust_remote_code=True) >>> image_path = dataset["test"][0]["file"] >>> image = Image.open(image_path).convert("RGB") >>> question = "When is coffee break?" 
diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py index c258e9e3aff..941ff860042 100644 --- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py @@ -859,7 +859,7 @@ class LayoutLMv3Model(LayoutLMv3PreTrainedModel): >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) >>> model = AutoModel.from_pretrained("microsoft/layoutlmv3-base") - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> image = example["image"] >>> words = example["tokens"] @@ -1075,7 +1075,7 @@ class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel): >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) >>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7) - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> image = example["image"] >>> words = example["tokens"] @@ -1191,7 +1191,7 @@ class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel): >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) >>> model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base") - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> image = example["image"] >>> question = "what's his name?" 
@@ -1311,7 +1311,7 @@ class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel): >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) >>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base") - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> image = example["image"] >>> words = example["tokens"] diff --git a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py index 6415f432479..574e14cc910 100644 --- a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py @@ -1296,7 +1296,7 @@ class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel): >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) >>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base") - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> image = example["image"] >>> words = example["tokens"] @@ -1439,7 +1439,7 @@ class TFLayoutLMv3ForSequenceClassification(TFLayoutLMv3PreTrainedModel, TFSeque >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) >>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base") - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> image = example["image"] >>> words = example["tokens"] @@ -1566,7 +1566,7 @@ class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenCla >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) >>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7) - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> image = example["image"] >>> words = example["tokens"] @@ -1703,7 +1703,7 @@ class TFLayoutLMv3ForQuestionAnswering(TFLayoutLMv3PreTrainedModel, TFQuestionAn >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) >>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base") - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> image = example["image"] >>> question = "what's his name?" 
diff --git a/src/transformers/models/lilt/modeling_lilt.py b/src/transformers/models/lilt/modeling_lilt.py index 4e4ee12c3de..85cbcfdc4c4 100644 --- a/src/transformers/models/lilt/modeling_lilt.py +++ b/src/transformers/models/lilt/modeling_lilt.py @@ -729,7 +729,7 @@ class LiltModel(LiltPreTrainedModel): >>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base") >>> model = AutoModel.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base") - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> words = example["tokens"] >>> boxes = example["bboxes"] @@ -868,7 +868,7 @@ class LiltForSequenceClassification(LiltPreTrainedModel): >>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base") >>> model = AutoModelForSequenceClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base") - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> words = example["tokens"] >>> boxes = example["bboxes"] @@ -987,7 +987,7 @@ class LiltForTokenClassification(LiltPreTrainedModel): >>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base") >>> model = AutoModelForTokenClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base") - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> words = example["tokens"] >>> boxes = example["bboxes"] @@ -1116,7 +1116,7 @@ class LiltForQuestionAnswering(LiltPreTrainedModel): >>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base") >>> model = AutoModelForQuestionAnswering.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base") - >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True) >>> example = dataset[0] >>> words = example["tokens"] >>> boxes = example["bboxes"] diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index d164622cd6a..e5097b7402b 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -463,7 +463,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel): >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) >>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values >>> # Inference: Translate English speech to German diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 8353a172b21..9832987f4e6 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ 
b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -1129,7 +1129,7 @@ class Speech2TextModel(Speech2TextPreTrainedModel):
         >>> model = Speech2TextModel.from_pretrained("facebook/s2t-small-librispeech-asr")
         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-librispeech-asr")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> inputs = feature_extractor(
         ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
         ... )
@@ -1270,7 +1270,7 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel):
 
         >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
 
         >>> inputs = processor(
         ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
index bac1256ca4b..6ad680d4fc0 100755
--- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
@@ -1483,7 +1483,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
         ...     return batch
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> ds = ds.map(map_to_array)
         >>> ds.set_format(type="tf")
diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py
index 0d991bee4f0..a69e9b56ebc 100644
--- a/src/transformers/models/speecht5/modeling_speecht5.py
+++ b/src/transformers/models/speecht5/modeling_speecht5.py
@@ -2338,7 +2338,7 @@ class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel):
         >>> from datasets import load_dataset
 
         >>> dataset = load_dataset(
-        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
+        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
         ... )  # doctest: +IGNORE_RESULT
         >>> dataset = dataset.sort("id")
         >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -3024,7 +3024,7 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
         >>> import torch
 
         >>> dataset = load_dataset(
-        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
+        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
         ... )  # doctest: +IGNORE_RESULT
         >>> dataset = dataset.sort("id")
         >>> sampling_rate = dataset.features["audio"].sampling_rate
diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py
index 4b170c1023a..972248daaae 100644
--- a/src/transformers/models/udop/modeling_udop.py
+++ b/src/transformers/models/udop/modeling_udop.py
@@ -1602,7 +1602,7 @@ class UdopModel(UdopPreTrainedModel):
 
         >>> # load an example image, along with the words and coordinates
         >>> # which were extracted using an OCR engine
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]
@@ -1781,7 +1781,7 @@ class UdopForConditionalGeneration(UdopPreTrainedModel):
 
         >>> # load an example image, along with the words and coordinates
         >>> # which were extracted using an OCR engine
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]
@@ -2009,7 +2009,7 @@ class UdopEncoderModel(UdopPreTrainedModel):
 
         >>> # load an example image, along with the words and coordinates
         >>> # which were extracted using an OCR engine
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]
diff --git a/src/transformers/models/univnet/modeling_univnet.py b/src/transformers/models/univnet/modeling_univnet.py
index 5b0c659c302..887493fdcf5 100644
--- a/src/transformers/models/univnet/modeling_univnet.py
+++ b/src/transformers/models/univnet/modeling_univnet.py
@@ -525,7 +525,7 @@ class UnivNetModel(PreTrainedModel):
         >>> model = UnivNetModel.from_pretrained("dg845/univnet-dev")
         >>> feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> # Resample the audio to the feature extractor's sampling rate.
         >>> ds = ds.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
         >>> inputs = feature_extractor(
diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
index 9a24b9d39fd..7a629e24572 100644
--- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
@@ -1076,7 +1076,7 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
     ...     return batch
 
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.map(map_to_array)
 
     >>> input_values = processor(
@@ -1195,7 +1195,7 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
     ...     return batch
 
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.map(map_to_array)
 
     >>> input_values = processor(
@@ -1396,7 +1396,7 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
     ...     return batch
 
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.map(map_to_array)
 
     >>> input_values = feature_extractor(ds["speech"][0], return_tensors="np").input_values  # Batch size 1
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index a8338e363d9..cc8478d5b3c 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -1542,7 +1542,7 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
         ...     return batch
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> ds = ds.map(map_to_array)
 
         >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
@@ -1654,7 +1654,7 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
         ...     return batch
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> ds = ds.map(map_to_array)
 
         >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index f9976f37e8e..476d4e7dd1e 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -2045,7 +2045,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
         >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1
 
         >>> # compute masked indices
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index 34848a841e9..647b18521d0 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -590,7 +590,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
 
         >>> # load first sample of English common_voice
-        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
+        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True)
         >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
         >>> dataset_iter = iter(dataset)
         >>> sample = next(dataset_iter)
diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
index c37dd980d4e..6f631e4683a 100644
--- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
+++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
@@ -1453,7 +1453,7 @@ class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel):
         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large")
         >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1
 
         >>> # compute masked indices
diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
index 8a0c3d31e96..410fe710194 100644
--- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
+++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -545,7 +545,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
         >>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
 
         >>> # load first sample of English common_voice
-        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
+        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True)
         >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
         >>> dataset_iter = iter(dataset)
         >>> sample = next(dataset_iter)
diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py
index 4d60427d8bd..fc572c7389c 100644
--- a/src/transformers/models/whisper/generation_whisper.py
+++ b/src/transformers/models/whisper/generation_whisper.py
@@ -461,7 +461,7 @@ class WhisperGenerationMixin:
         >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
         >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
         >>> input_features = inputs.input_features
diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py
index 8ad4ed9de24..9da592c107d 100644
--- a/src/transformers/models/whisper/modeling_flax_whisper.py
+++ b/src/transformers/models/whisper/modeling_flax_whisper.py
@@ -985,7 +985,7 @@ class FlaxWhisperPreTrainedModel(FlaxPreTrainedModel):
         >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
         >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
         >>> input_features = inputs.input_features
         >>> encoder_outputs = model.encode(input_features=input_features)
@@ -1045,7 +1045,7 @@ class FlaxWhisperPreTrainedModel(FlaxPreTrainedModel):
         >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
         >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> input_features = processor(ds[0]["audio"]["array"], return_tensors="np").input_features
 
         >>> encoder_outputs = model.encode(input_features=input_features)
@@ -1297,7 +1297,7 @@ class FlaxWhisperForConditionalGeneration(FlaxWhisperPreTrainedModel):
         >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
         >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
         >>> input_features = inputs.input_features
         >>> encoder_outputs = model.encode(input_features=input_features)
@@ -1516,7 +1516,7 @@ FLAX_WHISPER_CONDITIONAL_GENERATION_DOCSTRING = r"""
     >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
     >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
 
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
     >>> input_features = inputs.input_features
     >>> generated_ids = model.generate(input_ids=input_features)
@@ -1670,7 +1670,7 @@ FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r"""
     >>> model = FlaxWhisperForAudioClassification.from_pretrained(
     ...     "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True
     ... )
-    >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True)
+    >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True, trust_remote_code=True)
 
     >>> sample = next(iter(ds))
diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py
index 18f55dce8a2..6f50141bff9 100644
--- a/src/transformers/models/whisper/modeling_tf_whisper.py
+++ b/src/transformers/models/whisper/modeling_tf_whisper.py
@@ -1147,7 +1147,7 @@ class TFWhisperMainLayer(keras.layers.Layer):
         >>> model = TFWhisperModel.from_pretrained("openai/whisper-base")
         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="tf")
         >>> input_features = inputs.input_features
         >>> decoder_input_ids = tf.convert_to_tensor([[1, 1]]) * model.config.decoder_start_token_id
@@ -1283,7 +1283,7 @@ class TFWhisperModel(TFWhisperPreTrainedModel):
         >>> model = TFWhisperModel.from_pretrained("openai/whisper-base")
         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="tf")
         >>> input_features = inputs.input_features
         >>> decoder_input_ids = tf.convert_to_tensor([[1, 1]]) * model.config.decoder_start_token_id
@@ -1413,7 +1413,7 @@ class TFWhisperForConditionalGeneration(TFWhisperPreTrainedModel, TFCausalLangua
         >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
         >>> model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="tf")
         >>> input_features = inputs.input_features
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index d2a7107c1ee..aedc0c43aca 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -1590,7 +1590,7 @@ class WhisperModel(WhisperPreTrainedModel):
         >>> model = WhisperModel.from_pretrained("openai/whisper-base")
         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
         >>> input_features = inputs.input_features
         >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
@@ -1731,7 +1731,7 @@ class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedM
         >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
         >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
         >>> input_features = inputs.input_features
@@ -1983,7 +1983,7 @@ class WhisperForCausalLM(WhisperPreTrainedModel):
 
         >>> assistant_model = WhisperForCausalLM.from_pretrained("distil-whisper/distil-large-v2")
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
         >>> sample = ds[0]["audio"]
         >>> input_features = processor(
         ...     sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt"
diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py
index 23679f31a3e..7ca1c134714 100644
--- a/src/transformers/utils/doc.py
+++ b/src/transformers/utils/doc.py
@@ -385,7 +385,7 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r"""
     >>> import torch
     >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
     >>> dataset = dataset.sort("id")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -411,7 +411,7 @@ PT_SPEECH_CTC_SAMPLE = r"""
     >>> from datasets import load_dataset
     >>> import torch
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
     >>> dataset = dataset.sort("id")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -446,7 +446,7 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
     >>> from datasets import load_dataset
     >>> import torch
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
     >>> dataset = dataset.sort("id")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -482,7 +482,7 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
     >>> from datasets import load_dataset
     >>> import torch
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
     >>> dataset = dataset.sort("id")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -511,7 +511,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
     >>> from datasets import load_dataset
     >>> import torch
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
     >>> dataset = dataset.sort("id")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -546,7 +546,7 @@ PT_VISION_BASE_MODEL_SAMPLE = r"""
     >>> import torch
     >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("huggingface/cats-image")
+    >>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
     >>> image = dataset["test"]["image"][0]
 
     >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@@ -571,7 +571,7 @@ PT_VISION_SEQ_CLASS_SAMPLE = r"""
     >>> import torch
     >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("huggingface/cats-image")
+    >>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
     >>> image = dataset["test"]["image"][0]
 
     >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@@ -803,7 +803,7 @@ TF_SPEECH_BASE_MODEL_SAMPLE = r"""
     >>> from transformers import AutoProcessor, {model_class}
    >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
     >>> dataset = dataset.sort("id")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -828,7 +828,7 @@ TF_SPEECH_CTC_SAMPLE = r"""
     >>> from datasets import load_dataset
     >>> import tensorflow as tf
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
     >>> dataset = dataset.sort("id")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -863,7 +863,7 @@ TF_VISION_BASE_MODEL_SAMPLE = r"""
     >>> from transformers import AutoImageProcessor, {model_class}
     >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("huggingface/cats-image")
+    >>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
     >>> image = dataset["test"]["image"][0]
 
     >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@@ -886,7 +886,7 @@ TF_VISION_SEQ_CLASS_SAMPLE = r"""
     >>> import tensorflow as tf
     >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("huggingface/cats-image")
+    >>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
     >>> image = dataset["test"]["image"][0]
 
     >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
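All of the docstring edits above follow one pattern: newer versions of `datasets` treat script-backed dataset repositories as untrusted unless the caller opts in, so each example now passes `trust_remote_code=True` explicitly. A minimal sketch of the two cases, assuming `datasets>=2.20` is installed (the data-only dataset name is an illustrative choice, not taken from this patch):

```python
from datasets import load_dataset

# Script-backed repository: its loading script runs on your machine, so the
# call must opt in explicitly (only do this for code you have read).
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
    trust_remote_code=True,
)

# Data-only repository (plain parquet files, no script): nothing executes
# locally, so no flag is needed.
ds_plain = load_dataset("nyu-mll/glue", "mrpc", split="train")
```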
diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
index adc2d87ec31..5c39262cc93 100755
--- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
+++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
@@ -128,9 +128,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -274,7 +274,11 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        raw_datasets = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            trust_remote_code=model_args.trust_remote_code,
+        )
     else:
         data_files = {}
         if data_args.train_file is not None:
@@ -568,6 +572,15 @@ def parse_args():
         default=None,
         help="The configuration name of the dataset to use (via the datasets library).",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
     parser.add_argument(
         "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
     )
@@ -725,7 +738,9 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
     else:
         data_files = {}
         if args.train_file is not None:
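The template wires the same opt-in through `argparse`, and the DeepSpeed model-zoo test below exercises it as a CLI flag. A small self-contained sketch of that wiring (names mirror the template, but this snippet is an illustration, not part of the patch):

```python
import argparse

from datasets import load_dataset

parser = argparse.ArgumentParser()
parser.add_argument("--dataset_name", type=str, default=None)
parser.add_argument("--dataset_config_name", type=str, default=None)
# store_true keeps the safe default: remote code stays disabled unless the
# user explicitly passes --trust_remote_code on the command line.
parser.add_argument("--trust_remote_code", action="store_true")

args = parser.parse_args(["--dataset_name", "hf-internal-testing/cats_vs_dogs_sample", "--trust_remote_code"])

# The parsed flag is forwarded verbatim to load_dataset, as in the hunks above.
raw_datasets = load_dataset(
    args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
)
```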
load_dataset("hf-internal-testing/fixtures_ade20k", split="test") + ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) image1 = Image.open(ds[0]["file"]) map1 = Image.open(ds[1]["file"]) diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index 84ca3cdd621..0e3e3e32d27 100644 --- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -484,7 +484,7 @@ class BeitModelIntegrationTest(unittest.TestCase): image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False) - ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") + ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) image = Image.open(ds[0]["file"]) inputs = image_processor(images=image, return_tensors="pt").to(torch_device) @@ -527,7 +527,7 @@ class BeitModelIntegrationTest(unittest.TestCase): image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False) - ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") + ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) image = Image.open(ds[0]["file"]) inputs = image_processor(images=image, return_tensors="pt").to(torch_device) diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py index fec0f83af90..7a62f8f1a7f 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -123,7 +123,7 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - https://huggingface.co/bigscience/tokenizer/ """ tokenizer = self.get_rust_tokenizer() - ds = load_dataset("xnli", "all_languages", split="test", streaming=True) + ds = load_dataset("facebook/xnli", "all_languages", split="test", streaming=True) sample_data = next(iter(ds))["premise"] # pick up one data input_text = list(sample_data.values()) diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index d0e913df828..8f2d6df3cb6 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -164,7 +164,9 @@ class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 1255a31b819..7cb558b97a9 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -665,7 +665,9 @@ class ClapModelIntegrationTest(unittest.TestCase): "repeat": 0.0023, } - librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + librispeech_dummy = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) audio_sample = librispeech_dummy[-1] model_id = "laion/clap-htsat-unfused" @@ -692,7 +694,9 @@ class 
diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py
index d0e913df828..8f2d6df3cb6 100644
--- a/tests/models/clap/test_feature_extraction_clap.py
+++ b/tests/models/clap/test_feature_extraction_clap.py
@@ -164,7 +164,9 @@ class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
     # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples
     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py
index 1255a31b819..7cb558b97a9 100644
--- a/tests/models/clap/test_modeling_clap.py
+++ b/tests/models/clap/test_modeling_clap.py
@@ -665,7 +665,9 @@ class ClapModelIntegrationTest(unittest.TestCase):
             "repeat": 0.0023,
         }
 
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         audio_sample = librispeech_dummy[-1]
 
         model_id = "laion/clap-htsat-unfused"
@@ -692,7 +694,9 @@ class ClapModelIntegrationTest(unittest.TestCase):
             "pad": -0.000379,
         }
 
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         audio_sample = librispeech_dummy[-1]
 
         model_id = "laion/clap-htsat-fused"
@@ -719,7 +723,9 @@ class ClapModelIntegrationTest(unittest.TestCase):
             "pad": 0.0006,
         }
 
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]
 
         model_id = "laion/clap-htsat-fused"
@@ -746,7 +752,9 @@ class ClapModelIntegrationTest(unittest.TestCase):
             "pad": 0.0019,
         }
 
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]
 
         model_id = "laion/clap-htsat-unfused"
diff --git a/tests/models/clvp/test_feature_extraction_clvp.py b/tests/models/clvp/test_feature_extraction_clvp.py
index db641eaf614..83be97e8675 100644
--- a/tests/models/clvp/test_feature_extraction_clvp.py
+++ b/tests/models/clvp/test_feature_extraction_clvp.py
@@ -209,7 +209,9 @@ class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
         self.assertTrue(pt_processed.input_features.dtype == torch.float32)
 
     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         ds = ds.cast_column("audio", Audio(sampling_rate=22050))
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py
index 0cf89a74523..5d17d3fed62 100644
--- a/tests/models/clvp/test_modeling_clvp.py
+++ b/tests/models/clvp/test_modeling_clvp.py
@@ -371,7 +371,9 @@ class ClvpModelForConditionalGenerationTester:
     def prepare_config_and_inputs(self):
         _, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs()
 
-        ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = datasets.load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
         _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
@@ -553,7 +555,9 @@ class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase)
 class ClvpIntegrationTest(unittest.TestCase):
     def setUp(self):
         self.text = "This is an example text."
-        ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = datasets.load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
         _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py
index 2a71ded72a5..fd4b38a17ec 100644
--- a/tests/models/code_llama/test_tokenization_code_llama.py
+++ b/tests/models/code_llama/test_tokenization_code_llama.py
@@ -493,7 +493,7 @@ class LlamaIntegrationTest(unittest.TestCase):
         pyth_tokenizer = self.tokenizer
         rust_tokenizer = self.rust_tokenizer
 
-        dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
+        dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
         for item in tqdm.tqdm(dataset["validation"]):
             string = item["code"]
             encoded1 = pyth_tokenizer.encode(string)
@@ -506,7 +506,7 @@ class LlamaIntegrationTest(unittest.TestCase):
 
             self.assertEqual(decoded1, decoded2)
 
-        dataset = load_dataset("xnli", "all_languages")
+        dataset = load_dataset("facebook/xnli", "all_languages")
 
         for item in tqdm.tqdm(dataset["train"]):
             for string in item["premise"].values():
diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py
index d2e0e2853f0..8e9fb0d82fd 100644
--- a/tests/models/data2vec/test_modeling_data2vec_audio.py
+++ b/tests/models/data2vec/test_modeling_data2vec_audio.py
@@ -697,7 +697,9 @@ class Data2VecAudioUtilsTest(unittest.TestCase):
 @slow
 class Data2VecAudioModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
@@ -706,7 +708,7 @@ class Data2VecAudioModelIntegrationTest(unittest.TestCase):
         return [x["array"] for x in speech_samples]
 
     def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test")
+        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
 
         return ds[:num_samples]
diff --git a/tests/models/encodec/test_feature_extraction_encodec.py b/tests/models/encodec/test_feature_extraction_encodec.py
index e56517ac410..73c5019b11e 100644
--- a/tests/models/encodec/test_feature_extraction_encodec.py
+++ b/tests/models/encodec/test_feature_extraction_encodec.py
@@ -138,7 +138,9 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py
index be3d7161c64..f720327ec71 100644
--- a/tests/models/encodec/test_modeling_encodec.py
+++ b/tests/models/encodec/test_modeling_encodec.py
@@ -462,7 +462,9 @@ class EncodecIntegrationTest(unittest.TestCase):
             "1.5": [371955],
             "24.0": [6659962],
         }
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
 
         model_id = "facebook/encodec_24khz"
         model = EncodecModel.from_pretrained(model_id).to(torch_device)
@@ -516,7 +518,9 @@ class EncodecIntegrationTest(unittest.TestCase):
             "3.0": [144259, 146765, 156435, 176871, 161971],
             "24.0": [1568553, 1294948, 1306190, 1464747, 1663150],
         }
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
 
         model_id = "facebook/encodec_48khz"
         model = EncodecModel.from_pretrained(model_id).to(torch_device)
@@ -578,7 +582,9 @@ class EncodecIntegrationTest(unittest.TestCase):
                 [85561, 81870, 76953, 48967, 79315, 85442, 81479, 107241],
             ],
         }
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
 
         model_id = "facebook/encodec_48khz"
         model = EncodecModel.from_pretrained(model_id).to(torch_device)
diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py
index 4f755d81601..d36f1b7dc17 100644
--- a/tests/models/gemma/test_tokenization_gemma.py
+++ b/tests/models/gemma/test_tokenization_gemma.py
@@ -314,7 +314,7 @@ class GemmaIntegrationTest(unittest.TestCase):
         pyth_tokenizer = self.tokenizer
         rust_tokenizer = self.rust_tokenizer
 
-        dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
+        dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
         for item in tqdm.tqdm(dataset["validation"]):
             string = item["code"]
             encoded1 = pyth_tokenizer.encode(string)
@@ -333,7 +333,7 @@ class GemmaIntegrationTest(unittest.TestCase):
 
             self.assertEqual(decoded1, decoded2)
 
-        dataset = load_dataset("xnli", "all_languages")
+        dataset = load_dataset("facebook/xnli", "all_languages")
 
         for item in tqdm.tqdm(dataset["train"]):
             for string in item["premise"].values():
diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py
index 7bf19757fa3..b040c57082f 100644
--- a/tests/models/hubert/test_modeling_hubert.py
+++ b/tests/models/hubert/test_modeling_hubert.py
@@ -757,7 +757,9 @@ class HubertModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
@@ -768,7 +770,7 @@ class HubertModelIntegrationTest(unittest.TestCase):
     def _load_superb(self, task, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("anton-l/superb_dummy", task, split="test")
+        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
 
         return ds[:num_samples]
diff --git a/tests/models/hubert/test_modeling_tf_hubert.py b/tests/models/hubert/test_modeling_tf_hubert.py
index 3685e659874..35a8d98c233 100644
--- a/tests/models/hubert/test_modeling_tf_hubert.py
+++ b/tests/models/hubert/test_modeling_tf_hubert.py
@@ -609,7 +609,9 @@ class TFHubertModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
diff --git a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py
index 4413c8d756b..5e213e0a364 100644
--- a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py
@@ -103,7 +103,7 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
 
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
 
         image = Image.open(ds[0]["file"]).convert("RGB")
diff --git a/tests/models/layoutlmv2/test_processor_layoutlmv2.py b/tests/models/layoutlmv2/test_processor_layoutlmv2.py
index 61d8e2e195d..642eac6ba47 100644
--- a/tests/models/layoutlmv2/test_processor_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_processor_layoutlmv2.py
@@ -167,7 +167,7 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
         from datasets import load_dataset
 
         # set up
-        datasets = load_dataset("nielsr/funsd")
+        datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
         processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
 
         def preprocess_data(examples):
@@ -203,7 +203,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
         # we verify our implementation on 2 document images from the DocVQA dataset
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
 
         image_1 = Image.open(ds[0]["file"]).convert("RGB")
         image_2 = Image.open(ds[1]["file"]).convert("RGB")
diff --git a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py
index a12fb6af0d5..2e853653a49 100644
--- a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py
@@ -102,7 +102,7 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
 
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
 
         image = Image.open(ds[0]["file"]).convert("RGB")
diff --git a/tests/models/layoutlmv3/test_processor_layoutlmv3.py b/tests/models/layoutlmv3/test_processor_layoutlmv3.py
index 0c7e0d666d8..640eb92ea85 100644
--- a/tests/models/layoutlmv3/test_processor_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_processor_layoutlmv3.py
@@ -183,7 +183,7 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
         # we verify our implementation on 2 document images from the DocVQA dataset
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
 
         image_1 = Image.open(ds[0]["file"]).convert("RGB")
         image_2 = Image.open(ds[1]["file"]).convert("RGB")
diff --git a/tests/models/layoutxlm/test_processor_layoutxlm.py b/tests/models/layoutxlm/test_processor_layoutxlm.py
index 240c2ae05c2..98f6f07e38e 100644
--- a/tests/models/layoutxlm/test_processor_layoutxlm.py
+++ b/tests/models/layoutxlm/test_processor_layoutxlm.py
@@ -160,7 +160,7 @@ class LayoutXLMProcessorTest(unittest.TestCase):
         from datasets import load_dataset
 
         # set up
-        datasets = load_dataset("nielsr/funsd")
+        datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
         processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)
 
         def preprocess_data(examples):
@@ -198,7 +198,7 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
         # we verify our implementation on 2 document images from the DocVQA dataset
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
 
         image_1 = Image.open(ds[0]["file"]).convert("RGB")
         image_2 = Image.open(ds[1]["file"]).convert("RGB")
diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
index fba883513f8..a41774e9f5d 100644
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -514,7 +514,7 @@ class LlamaIntegrationTest(unittest.TestCase):
         pyth_tokenizer = self.tokenizer
         rust_tokenizer = self.rust_tokenizer
 
-        dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
+        dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
         for item in tqdm.tqdm(dataset["validation"]):
             string = item["code"]
             encoded1 = pyth_tokenizer.encode(string)
@@ -527,7 +527,7 @@ class LlamaIntegrationTest(unittest.TestCase):
 
             self.assertEqual(decoded1, decoded2)
 
-        dataset = load_dataset("xnli", "all_languages")
+        dataset = load_dataset("facebook/xnli", "all_languages")
 
         for item in tqdm.tqdm(dataset["train"]):
             for string in item["premise"].values():
diff --git a/tests/models/mobilevit/test_image_processing_mobilevit.py b/tests/models/mobilevit/test_image_processing_mobilevit.py
index 9895befc8f4..8849839a097 100644
--- a/tests/models/mobilevit/test_image_processing_mobilevit.py
+++ b/tests/models/mobilevit/test_image_processing_mobilevit.py
@@ -87,7 +87,7 @@ class MobileViTImageProcessingTester(unittest.TestCase):
 
 def prepare_semantic_single_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
 
     image = Image.open(dataset[0]["file"])
     map = Image.open(dataset[1]["file"])
@@ -96,7 +96,7 @@ def prepare_semantic_single_inputs():
 
 def prepare_semantic_batch_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
 
     image1 = Image.open(dataset[0]["file"])
     map1 = Image.open(dataset[1]["file"])
diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py
index 86a1f154f1c..a6a9ccdc4c8 100644
--- a/tests/models/perceiver/test_modeling_perceiver.py
+++ b/tests/models/perceiver/test_modeling_perceiver.py
@@ -851,7 +851,7 @@ def prepare_img():
 # Helper functions for optical flow integration test
 def prepare_optical_flow_images():
-    dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test")
+    dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test", trust_remote_code=True)
 
     image1 = Image.open(dataset[0]["file"]).convert("RGB")
     image2 = Image.open(dataset[0]["file"]).convert("RGB")
diff --git a/tests/models/pop2piano/test_feature_extraction_pop2piano.py b/tests/models/pop2piano/test_feature_extraction_pop2piano.py
index fcc424fbf6e..263006c670f 100644
--- a/tests/models/pop2piano/test_feature_extraction_pop2piano.py
+++ b/tests/models/pop2piano/test_feature_extraction_pop2piano.py
@@ -136,7 +136,9 @@ class Pop2PianoFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittes
         self.assertTrue(input_features.extrapolated_beatstep.ndim == 2)
 
     def test_integration(self):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         speech_samples = ds.sort("id").select([0])["audio"]
         input_speech = [x["array"] for x in speech_samples][0]
         sampling_rate = [x["sampling_rate"] for x in speech_samples][0]
diff --git a/tests/models/pop2piano/test_processor_pop2piano.py b/tests/models/pop2piano/test_processor_pop2piano.py
index 06a8bacfd8a..634cdd26bd1 100644
--- a/tests/models/pop2piano/test_processor_pop2piano.py
+++ b/tests/models/pop2piano/test_processor_pop2piano.py
@@ -111,7 +111,9 @@ class Pop2PianoProcessorTest(unittest.TestCase):
     def get_inputs(self):
         """get inputs for both feature extractor and tokenizer"""
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         speech_samples = ds.sort("id").select([0])["audio"]
         input_speech = [x["array"] for x in speech_samples][0]
         sampling_rate = [x["sampling_rate"] for x in speech_samples][0]
diff --git a/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py b/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py
index a8fca4b90ba..d9919e0adea 100644
--- a/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py
@@ -258,7 +258,9 @@ class SeamlessM4TFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
         self.assertTrue(pt_processed.input_features.dtype == torch.float32)
 
     def _load_datasample(self, id):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_sample = ds.sort("id")[id]["audio"]["array"]
diff --git a/tests/models/segformer/test_image_processing_segformer.py b/tests/models/segformer/test_image_processing_segformer.py
index 988843b710f..22399300018 100644
--- a/tests/models/segformer/test_image_processing_segformer.py
+++ b/tests/models/segformer/test_image_processing_segformer.py
@@ -87,7 +87,7 @@ class SegformerImageProcessingTester(unittest.TestCase):
 
 def prepare_semantic_single_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
 
     image = Image.open(dataset[0]["file"])
     map = Image.open(dataset[1]["file"])
@@ -96,7 +96,7 @@ def prepare_semantic_single_inputs():
 
 def prepare_semantic_batch_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
 
     image1 = Image.open(dataset[0]["file"])
     map1 = Image.open(dataset[1]["file"])
diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py
index fd8ba9725e8..ebe0ef30a38 100644
--- a/tests/models/sew/test_modeling_sew.py
+++ b/tests/models/sew/test_modeling_sew.py
@@ -497,7 +497,9 @@ class SEWModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py
index 7e907f40b48..c7b6e30b189 100644
--- a/tests/models/sew_d/test_modeling_sew_d.py
+++ b/tests/models/sew_d/test_modeling_sew_d.py
@@ -511,7 +511,9 @@ class SEWDModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
index 9023e8467f7..6c8861e3d86 100644
--- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
+++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
@@ -259,7 +259,9 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py
index 183120b8206..ddd83974e82 100644
--- a/tests/models/speech_to_text/test_modeling_speech_to_text.py
+++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py
@@ -791,7 +791,9 @@ class Speech2TextModelIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
diff --git a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py
index c2fd215f388..d1217453339 100644
--- a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py
+++ b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py
@@ -587,7 +587,9 @@ class TFSpeech2TextModelIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
diff --git a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py
index 5ec632e7e76..f8f7f53cac2 100644
--- a/tests/models/speecht5/test_feature_extraction_speecht5.py
+++ b/tests/models/speecht5/test_feature_extraction_speecht5.py
@@ -380,7 +380,9 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
index 87ad1589d9a..06afc90c2cc 100644
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -741,7 +741,9 @@ class SpeechT5ForSpeechToTextIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
@@ -1763,7 +1765,9 @@ class SpeechT5ForSpeechToSpeechIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py
index ed753612fc3..82d349e39cc 100644
--- a/tests/models/t5/test_tokenization_t5.py
+++ b/tests/models/t5/test_tokenization_t5.py
@@ -598,7 +598,7 @@ class CommonSpmIntegrationTests(unittest.TestCase):
         from datasets import load_dataset
         from seqio import SentencePieceVocabulary
 
-        ds = load_dataset("xnli", "all_languages", split="train+test+validation")
+        ds = load_dataset("facebook/xnli", "all_languages", split="train+test+validation")
 
         # TODO @ArthurZucker fix the 3 commented tests with #23909
         input_texts = [
diff --git a/tests/models/udop/test_processor_udop.py b/tests/models/udop/test_processor_udop.py
index ceb5f1e3318..eaa8ca7e5af 100644
--- a/tests/models/udop/test_processor_udop.py
+++ b/tests/models/udop/test_processor_udop.py
@@ -185,7 +185,7 @@ class UdopProcessorTest(unittest.TestCase):
         from datasets import load_dataset
 
         # set up
-        datasets = load_dataset("nielsr/funsd")
+        datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
         processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
 
         def preprocess_data(examples):
@@ -223,7 +223,7 @@ class UdopProcessorIntegrationTests(unittest.TestCase):
         # we verify our implementation on 2 document images from the DocVQA dataset
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
 
         image_1 = Image.open(ds[0]["file"]).convert("RGB")
         image_2 = Image.open(ds[1]["file"]).convert("RGB")
diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py
index 79532cef853..a8931617c5e 100644
--- a/tests/models/unispeech/test_modeling_unispeech.py
+++ b/tests/models/unispeech/test_modeling_unispeech.py
@@ -548,7 +548,9 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T
 @slow
 class UniSpeechModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
@@ -557,7 +559,7 @@ class UniSpeechModelIntegrationTest(unittest.TestCase):
         return [x["array"] for x in speech_samples]
 
     def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test")
+        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
 
         return ds[:num_samples]
diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
index 0b6438f86a0..9ed41944660 100644
--- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
+++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
@@ -812,7 +812,9 @@ class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase):
 @slow
 class UniSpeechSatModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
@@ -821,7 +823,7 @@ class UniSpeechSatModelIntegrationTest(unittest.TestCase):
         return [x["array"] for x in speech_samples]
 
     def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test")
+        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
 
         return ds[:num_samples]
diff --git a/tests/models/univnet/test_feature_extraction_univnet.py b/tests/models/univnet/test_feature_extraction_univnet.py
index dfa335d1538..673faaae9ad 100644
--- a/tests/models/univnet/test_feature_extraction_univnet.py
+++ b/tests/models/univnet/test_feature_extraction_univnet.py
@@ -327,7 +327,9 @@ class UnivNetFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
         self.assertTrue(pt_processed.input_features.dtype == torch.float32)
 
     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate))
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py
index 1f579b9c91b..45a4caf6aed 100644
--- a/tests/models/univnet/test_modeling_univnet.py
+++ b/tests/models/univnet/test_modeling_univnet.py
@@ -220,7 +220,9 @@ class UnivNetModelIntegrationTests(unittest.TestCase):
         torch.cuda.empty_cache()
 
     def _load_datasamples(self, num_samples, sampling_rate=24000):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py
index 194b4dd7ba3..4b6cb0bb596 100644
--- a/tests/models/vilt/test_modeling_vilt.py
+++ b/tests/models/vilt/test_modeling_vilt.py
@@ -637,7 +637,7 @@ class ViltModelIntegrationTest(unittest.TestCase):
         processor = self.default_processor
 
-        dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="test")
+        dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="test", trust_remote_code=True)
         image1 = Image.open(dataset[0]["file"]).convert("RGB")
         image2 = Image.open(dataset[1]["file"]).convert("RGB")
diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
index 3239b507a81..963860725b7 100644
--- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
+++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
@@ -815,7 +815,7 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
     def test_inference_handwritten(self):
         model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(torch_device)
 
-        dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test")
+        dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
         image = Image.open(dataset[0]["file"]).convert("RGB")
 
         processor = self.default_processor
@@ -840,7 +840,7 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
     def test_inference_printed(self):
         model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(torch_device)
 
-        dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test")
+        dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
         image = Image.open(dataset[1]["file"]).convert("RGB")
 
         processor = self.default_processor
diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py
index 4cff7dca41c..18252a17524 100644
--- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py
@@ -72,7 +72,7 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout):
     try:
         _ = in_queue.get(timeout=timeout)
 
-        ds = load_dataset("common_voice", "es", split="test", streaming=True)
+        ds = load_dataset("legacy-datasets/common_voice", "es", split="test", streaming=True, trust_remote_code=True)
         sample = next(iter(ds))
         resampled_audio = librosa.resample(sample["audio"]["array"], 48_000, 16_000)
@@ -489,7 +489,9 @@ class FlaxWav2Vec2UtilsTest(unittest.TestCase):
 @slow
 class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
@@ -585,7 +587,7 @@ class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase):
     @require_pyctcdecode
     @require_librosa
     def test_wav2vec2_with_lm(self):
-        ds = load_dataset("common_voice", "es", split="test", streaming=True)
+        ds = load_dataset("legacy-datasets/common_voice", "es", split="test", streaming=True, trust_remote_code=True)
         sample = next(iter(ds))
         resampled_audio = librosa.resample(sample["audio"]["array"], 48_000, 16_000)
@@ -604,7 +606,7 @@ class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase):
     @require_pyctcdecode
     @require_librosa
     def test_wav2vec2_with_lm_pool(self):
-        ds = load_dataset("common_voice", "es", split="test", streaming=True)
+        ds = load_dataset("legacy-datasets/common_voice", "es", split="test", streaming=True, trust_remote_code=True)
         sample = next(iter(ds))
         resampled_audio = librosa.resample(sample["audio"]["array"], 48_000, 16_000)
diff --git a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py
index 7af3cd34e8a..2f10e3378d7 100644
--- a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py
@@ -716,7 +716,9 @@ class TFWav2Vec2ModelIntegrationTest(unittest.TestCase):
         gc.collect()
 
     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
@@ -725,7 +727,7 @@ class
TFWav2Vec2ModelIntegrationTest(unittest.TestCase): return [x["array"] for x in speech_samples] def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test") + ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) return ds[:num_samples] diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index ad70a69811e..9db852f014e 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -101,7 +101,9 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): try: _ = in_queue.get(timeout=timeout) - ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) resampled_audio = torchaudio.functional.resample( @@ -1468,7 +1470,9 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase): backend_empty_cache(torch_device) def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] @@ -1477,7 +1481,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase): return [x["array"] for x in speech_samples] def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test") + ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) return ds[:num_samples] @@ -1843,7 +1847,9 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase): @require_pyctcdecode @require_torchaudio def test_wav2vec2_with_lm(self): - ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) resampled_audio = torchaudio.functional.resample( @@ -1867,7 +1873,9 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase): @require_pyctcdecode @require_torchaudio def test_wav2vec2_with_lm_pool(self): - ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) resampled_audio = torchaudio.functional.resample( @@ -1965,7 +1973,9 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase): LANG_MAP = {"it": "ita", "es": "spa", "fr": "fra", "en": "eng"} def run_model(lang): - ds = load_dataset("mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) wav2vec2_lang = LANG_MAP[lang] diff --git a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py index 80237fea9d1..0fbd000edc8 100644 --- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py +++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py @@ -855,7 +855,9 @@ class 
Wav2Vec2BertUtilsTest(unittest.TestCase): @slow class Wav2Vec2BertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) # automatic decoding with librispeech speech_samples = ds.sort("id").filter(lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]) speech_samples = speech_samples[:num_samples]["audio"] diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index dd8a4570d32..f119cd3dc2a 100644 --- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -866,7 +866,9 @@ class Wav2Vec2ConformerUtilsTest(unittest.TestCase): @slow class Wav2Vec2ConformerModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) # automatic decoding with librispeech speech_samples = ds.sort("id").filter(lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]) speech_samples = speech_samples[:num_samples]["audio"] diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py index 61dee30091d..f0320583bf9 100644 --- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py +++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py @@ -463,7 +463,9 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase): def test_word_time_stamp_integration(self): import torch - ds = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True + ) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000)) ds_iter = iter(ds) sample = next(ds_iter) diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py index 7305898eace..1ec5476e879 100644 --- a/tests/models/wavlm/test_modeling_wavlm.py +++ b/tests/models/wavlm/test_modeling_wavlm.py @@ -494,7 +494,9 @@ class WavLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): @slow class WavLMModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] @@ -503,7 +505,7 @@ class WavLMModelIntegrationTest(unittest.TestCase): return [x["array"] for x in speech_samples] def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test") + ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) return ds[:num_samples] diff --git a/tests/models/whisper/test_feature_extraction_whisper.py 
b/tests/models/whisper/test_feature_extraction_whisper.py index a8295542f4e..579c42519ae 100644 --- a/tests/models/whisper/test_feature_extraction_whisper.py +++ b/tests/models/whisper/test_feature_extraction_whisper.py @@ -215,7 +215,9 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. self.assertTrue(pt_processed.input_features.dtype == torch.float32) def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py index d0616637692..d5e18d22c2f 100644 --- a/tests/models/whisper/test_modeling_flax_whisper.py +++ b/tests/models/whisper/test_modeling_flax_whisper.py @@ -410,7 +410,9 @@ class FlaxWhisperModelIntegrationTest(unittest.TestCase): return WhisperProcessor.from_pretrained("openai/whisper-base") def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -561,7 +563,7 @@ class FlaxWhisperModelIntegrationTest(unittest.TestCase): processor = WhisperProcessor.from_pretrained("openai/whisper-large") model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-large", from_pt=True) - ds = load_dataset("common_voice", "ja", split="test", streaming=True) + ds = load_dataset("legacy-datasets/common_voice", "ja", split="test", streaming=True, trust_remote_code=True) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000)) input_speech = next(iter(ds))["audio"]["array"] input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="np") diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py index e2df1b13976..97143cc4df5 100644 --- a/tests/models/whisper/test_modeling_tf_whisper.py +++ b/tests/models/whisper/test_modeling_tf_whisper.py @@ -704,7 +704,7 @@ class TFWhisperModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestC def _load_datasamples(num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -795,7 +795,7 @@ def _test_large_generation_multilingual(in_queue, out_queue, timeout): processor = WhisperProcessor.from_pretrained("openai/whisper-large") model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-large") - ds = load_dataset("common_voice", "ja", split="test", streaming=True) + ds = load_dataset("legacy-datasets/common_voice", "ja", split="test", streaming=True, trust_remote_code=True) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000)) input_speech = next(iter(ds))["audio"]["array"] input_features = processor.feature_extractor(raw_speech=input_speech, 
return_tensors="tf").input_features diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index a0c5afa4d6f..b749da805a5 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1552,7 +1552,9 @@ class WhisperModelIntegrationTests(unittest.TestCase): return WhisperProcessor.from_pretrained("openai/whisper-base") def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -1763,7 +1765,9 @@ class WhisperModelIntegrationTests(unittest.TestCase): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") model.to(torch_device) - ds = load_dataset("facebook/multilingual_librispeech", "german", split="test", streaming=True) + ds = load_dataset( + "facebook/multilingual_librispeech", "german", split="test", streaming=True, trust_remote_code=True + ) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000)) input_speech = next(iter(ds))["audio"]["array"] @@ -1830,7 +1834,14 @@ class WhisperModelIntegrationTests(unittest.TestCase): model.to(torch_device) token = os.getenv("HF_HUB_READ_TOKEN", True) - ds = load_dataset("mozilla-foundation/common_voice_6_1", "ja", split="test", streaming=True, token=token) + ds = load_dataset( + "mozilla-foundation/common_voice_6_1", + "ja", + split="test", + streaming=True, + token=token, + trust_remote_code=True, + ) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000)) input_speech = next(iter(ds))["audio"]["array"] @@ -2358,7 +2369,9 @@ class WhisperModelIntegrationTests(unittest.TestCase): ) assistant_model.to(torch_device) - dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + dataset = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) sample = dataset[0]["audio"] input_features = processor(sample["array"], return_tensors="pt", sampling_rate=16_000).input_features @@ -2407,7 +2420,9 @@ class WhisperModelIntegrationTests(unittest.TestCase): ) assistant_model.to(torch_device) - dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + dataset = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) sample = dataset[0]["audio"] input_features = processor(sample["array"], return_tensors="pt", sampling_rate=16_000).input_features @@ -2448,7 +2463,7 @@ class WhisperModelIntegrationTests(unittest.TestCase): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) input_features = processor( @@ -2484,7 +2499,9 @@ class WhisperModelIntegrationTests(unittest.TestCase): prompt = "Mr. Kilter, Brionno." 
# let's force Quilter -> Kilter, Brion -> Brionno prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]", trust_remote_code=True + ) one_audio = np.concatenate([x["array"] for x in ds["audio"]], dtype=np.float32) first_text = ds[0]["text"].lower() @@ -2535,7 +2552,7 @@ class WhisperModelIntegrationTests(unittest.TestCase): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) input_features = processor( @@ -2568,7 +2585,7 @@ class WhisperModelIntegrationTests(unittest.TestCase): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) input_features = processor( @@ -2610,7 +2627,7 @@ class WhisperModelIntegrationTests(unittest.TestCase): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) audios = [] audios.append(one_audio[110000:]) @@ -2664,7 +2681,7 @@ class WhisperModelIntegrationTests(unittest.TestCase): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) audios = [] audios.append(one_audio[110000:]) diff --git a/tests/pipelines/test_pipelines_audio_classification.py b/tests/pipelines/test_pipelines_audio_classification.py index 48c39ff663f..57040a468be 100644 --- a/tests/pipelines/test_pipelines_audio_classification.py +++ b/tests/pipelines/test_pipelines_audio_classification.py @@ -69,7 +69,9 @@ class AudioClassificationPipelineTests(unittest.TestCase): import datasets # test with a local file - dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + dataset = datasets.load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) audio = dataset[0]["audio"]["array"] output = audio_classifier(audio) self.assertEqual( @@ -115,7 +117,7 @@ class AudioClassificationPipelineTests(unittest.TestCase): model = "superb/wav2vec2-base-superb-ks" audio_classifier = pipeline("audio-classification", model=model) - dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test") + dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test", trust_remote_code=True) audio 
= np.array(dataset[3]["speech"], dtype=np.float32) output = audio_classifier(audio, top_k=4) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 430666990fe..73376ff2189 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -206,7 +206,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): @require_torch @require_pyctcdecode def test_large_model_pt_with_lm(self): - dataset = load_dataset("Narsil/asr_dummy", streaming=True) + dataset = load_dataset("Narsil/asr_dummy", streaming=True, trust_remote_code=True) third_item = next(iter(dataset["test"].skip(3))) filename = third_item["file"] @@ -296,7 +296,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): output = speech_recognizer(waveform) self.assertEqual(output, {"text": ""}) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) @@ -313,7 +315,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): output = speech_recognizer(waveform) self.assertEqual(output, {"text": ""}) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "a man said to the universe sir i exist"}) @@ -328,7 +332,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): chunk_length_s=8, stride_length_s=1, ) - data = load_dataset("librispeech_asr", "clean", split="test", streaming=True) + data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True) sample = next(iter(data)) pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language="en", task="transcribe") @@ -371,7 +375,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): task="automatic-speech-recognition", model="openai/whisper-tiny.en", ) - data = load_dataset("librispeech_asr", "clean", split="test", streaming=True) + data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True) samples = [next(iter(data)) for _ in range(8)] audio = np.concatenate([sample["audio"]["array"] for sample in samples]) @@ -488,7 +492,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): model="openai/whisper-tiny", framework="pt", ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) @@ -663,7 +669,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): @slow @require_torch def test_whisper_timestamp_prediction(self): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", 
split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") array = np.concatenate( [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] ) @@ -761,7 +769,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): @slow @require_torch def test_whisper_large_timestamp_prediction(self): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") array = np.concatenate( [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] ) @@ -855,7 +865,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): chunk_length_s=3, return_timestamps="word", ) - data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + data = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) sample = data[0]["audio"] # not the same output as test_simple_whisper_asr because of chunking @@ -898,7 +910,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): model="openai/whisper-large-v3", return_timestamps="word", ) - data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + data = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) sample = data[0]["audio"] # not the same output as test_simple_whisper_asr because of chunking @@ -943,7 +957,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): framework="pt", ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! 
"'}) @@ -961,7 +977,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): output = asr(waveform) self.assertEqual(output, {"text": ""}) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") filename = ds[40]["file"] output = asr(filename) self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) @@ -987,7 +1005,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): output = asr(waveform) self.assertEqual(output, {"text": "(Applausi)"}) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") filename = ds[40]["file"] output = asr(filename) self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."}) @@ -1007,7 +1027,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): model="openai/whisper-tiny.en", framework="pt", ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) filename = ds[0]["file"] output = speech_recognizer(filename) self.assertEqual( @@ -1076,7 +1098,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): model="openai/whisper-large", framework="pt", ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) @@ -1111,7 +1135,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): model="openai/whisper-tiny.en", framework="pt", ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) filename = ds[0]["file"] # 1. 
English-only model compatible with no language argument @@ -1144,7 +1170,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): @slow def test_speculative_decoding_whisper_non_distil(self): # Load data: - dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]") + dataset = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True + ) sample = dataset[0]["audio"] # Load model: @@ -1188,7 +1216,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): @slow def test_speculative_decoding_whisper_distil(self): # Load data: - dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]") + dataset = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True + ) sample = dataset[0]["audio"] # Load model: @@ -1240,7 +1270,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): framework="pt", ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."}) @@ -1256,7 +1288,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): framework="pt", ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."}) @@ -1273,7 +1307,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): framework="pt", ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) @@ -1290,7 +1326,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): framework="pt", ) - dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + dataset = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) sample = dataset[0]["audio"] output = speech_recognizer(sample) @@ -1307,7 +1345,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): chunk_length_s=10.0, ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1323,7 +1363,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): model="hf-internal-testing/tiny-random-wav2vec2", ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") # Take short audio to keep the test readable 
audio = ds[40]["audio"]["array"][:800] @@ -1367,7 +1409,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): chunk_length_s=10.0, ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1395,7 +1439,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): ) self.assertEqual(speech_recognizer.type, "ctc_with_lm") - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1423,7 +1469,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): ) self.assertEqual(speech_recognizer.type, "ctc_with_lm") - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1507,7 +1555,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): device=torch_device, ) - dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + dataset = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) sample = dataset[0]["audio"] result = pipe(sample, generate_kwargs={"tgt_lang": "eng"}) @@ -1530,7 +1580,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): chunk_length_s=10.0, ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") audio = ds[40]["audio"]["array"] n_repeats = 10 @@ -1642,7 +1694,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): model="patrickvonplaten/wav2vec2-base-100h-with-lm", chunk_length_s=10.0, ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") audio = ds[40]["audio"]["array"] n_repeats = 10 diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 6ede7d1c7ac..2ef3e6e5ff6 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -840,7 +840,9 @@ class CustomPipelineTest(unittest.TestCase): def test_chunk_pipeline_batching_single_file(self): # Make sure we have cached the pipeline. 
pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC") - ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + ds = datasets.load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ).sort("id") audio = ds[40]["audio"]["array"] pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC") diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index 23a95570abd..6546df2a1b9 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ b/tests/pipelines/test_pipelines_image_segmentation.py @@ -567,7 +567,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase): image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor) - image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") + image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) file = image[0]["file"] outputs = image_segmenter(file, threshold=threshold) @@ -621,7 +621,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase): def test_oneformer(self): image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny") - image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") + image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) file = image[0]["file"] outputs = image_segmenter(file, threshold=0.99) # Shortening by hashing diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index b181bb89658..679754db38c 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -178,7 +178,7 @@ class GgufIntegrationTests(unittest.TestCase): gguf_tokenizer = AutoTokenizer.from_pretrained(self.model_id, gguf_file=self.q8_0_gguf_model_id) original_tokenizer = AutoTokenizer.from_pretrained(self.original_model_id) - dataset = load_dataset("code_x_glue_ct_code_to_text", "go") + dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go") for item in tqdm.tqdm(dataset["validation"]): string = item["code"] encoded1 = gguf_tokenizer.encode(string) @@ -191,7 +191,7 @@ class GgufIntegrationTests(unittest.TestCase): self.assertEqual(decoded1, decoded2) - dataset = load_dataset("xnli", "all_languages") + dataset = load_dataset("facebook/xnli", "all_languages") for i, item in enumerate(tqdm.tqdm(dataset["train"].select(range(100)))): for string in item["premise"].values(): diff --git a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py index 126659f0bcb..4c1b1b991ea 100644 --- a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py +++ b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py @@ -253,7 +253,7 @@ def main(): # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. - datasets = load_dataset("glue", data_args.task_name) + datasets = load_dataset("nyu-mll/glue", data_args.task_name) else: # Loading a dataset from your local files. # CSV/JSON training and evaluation files are needed. 
diff --git a/tests/sagemaker/scripts/tensorflow/run_tf.py b/tests/sagemaker/scripts/tensorflow/run_tf.py index 315fcca8980..a5b8e3fe1f0 100644 --- a/tests/sagemaker/scripts/tensorflow/run_tf.py +++ b/tests/sagemaker/scripts/tensorflow/run_tf.py @@ -56,7 +56,7 @@ if __name__ == "__main__": tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) # Load dataset - train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"]) + train_dataset, test_dataset = load_dataset("stanfordnlp/imdb", split=["train", "test"]) train_dataset = train_dataset.shuffle().select(range(5000)) # smaller the size for train dataset to 5k test_dataset = test_dataset.shuffle().select(range(500)) # smaller the size for test dataset to 500 diff --git a/tests/sagemaker/scripts/tensorflow/run_tf_dist.py b/tests/sagemaker/scripts/tensorflow/run_tf_dist.py index 324715e12fc..87e96268261 100644 --- a/tests/sagemaker/scripts/tensorflow/run_tf_dist.py +++ b/tests/sagemaker/scripts/tensorflow/run_tf_dist.py @@ -50,7 +50,7 @@ def fit(model, loss, opt, train_dataset, epochs, train_batch_size, max_steps=Non def get_datasets(tokenizer, train_batch_size, eval_batch_size): # Load dataset - train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"]) + train_dataset, test_dataset = load_dataset("stanfordnlp/imdb", split=["train", "test"]) # Preprocess train dataset train_dataset = train_dataset.map( diff --git a/tests/trainer/test_trainer_seq2seq.py b/tests/trainer/test_trainer_seq2seq.py index 17593e2e73b..a4b38aecb2a 100644 --- a/tests/trainer/test_trainer_seq2seq.py +++ b/tests/trainer/test_trainer_seq2seq.py @@ -43,8 +43,8 @@ class Seq2seqTrainerTester(TestCasePlus): bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id bert2bert.config.max_length = 128 - train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]") - val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]") + train_dataset = datasets.load_dataset("abisee/cnn_dailymail", "3.0.0", split="train[:1%]") + val_dataset = datasets.load_dataset("abisee/cnn_dailymail", "3.0.0", split="validation[:1%]") train_dataset = train_dataset.select(range(32)) val_dataset = val_dataset.select(range(16)) @@ -145,7 +145,7 @@ class Seq2seqTrainerTester(TestCasePlus): MAX_INPUT_LENGTH = 256 MAX_TARGET_LENGTH = 256 - dataset = datasets.load_dataset("gsm8k", "main", split="train[:38]") + dataset = datasets.load_dataset("openai/gsm8k", "main", split="train[:38]") model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small") tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest") diff --git a/tests/utils/test_audio_utils.py b/tests/utils/test_audio_utils.py index d6471813076..ff1a9877929 100644 --- a/tests/utils/test_audio_utils.py +++ b/tests/utils/test_audio_utils.py @@ -259,7 +259,9 @@ class AudioUtilsFunctionTester(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] return [x["array"] for x in speech_samples] diff --git a/utils/create_dummy_models.py b/utils/create_dummy_models.py index 3a2c39de4c8..daa330839ca 100644 --- 
a/utils/create_dummy_models.py +++ b/utils/create_dummy_models.py @@ -1080,7 +1080,7 @@ def build(config_class, models_to_create, output_dir): it. Models in different frameworks with the same architecture will be saved in the same subdirectory. """ if data["training_ds"] is None or data["testing_ds"] is None: - ds = load_dataset("wikitext", "wikitext-2-raw-v1") + ds = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1") data["training_ds"] = ds["train"] data["testing_ds"] = ds["test"]
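For reference, the streaming hunks above (Common Voice, multilingual LibriSpeech) all share one access pattern: stream the split, cast the audio column to 16 kHz, and pull a single decoded sample. A minimal sketch of that pattern, using the same `mozilla-foundation/common_voice_11_0` dataset as the tests (it is gated, so a Hub token may additionally be required):

```python
from datasets import Audio, load_dataset

ds = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "en",
    split="test",
    streaming=True,
    trust_remote_code=True,
)
# Decode lazily at 16 kHz; streaming avoids downloading the full split.
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
sample = next(iter(ds))
waveform = sample["audio"]["array"]  # 1-D float array at 16 kHz
```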