diff --git a/tests/models/albert/test_tokenization_albert.py b/tests/models/albert/test_tokenization_albert.py
index 343cba168f2..e3f39257a68 100644
--- a/tests/models/albert/test_tokenization_albert.py
+++ b/tests/models/albert/test_tokenization_albert.py
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
 @require_sentencepiece
 @require_tokenizers
 class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "albert/albert-base-v1"
     tokenizer_class = AlbertTokenizer
     rust_tokenizer_class = AlbertTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py
index 746716161ac..f3a63d6d417 100644
--- a/tests/models/bart/test_tokenization_bart.py
+++ b/tests/models/bart/test_tokenization_bart.py
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_det
 @require_tokenizers
 class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/bart-base"
     tokenizer_class = BartTokenizer
     rust_tokenizer_class = BartTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/barthez/test_tokenization_barthez.py b/tests/models/barthez/test_tokenization_barthez.py
index 7759d3560de..b2b0c7b058d 100644
--- a/tests/models/barthez/test_tokenization_barthez.py
+++ b/tests/models/barthez/test_tokenization_barthez.py
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_sentencepiece
 @slow  # see https://github.com/huggingface/transformers/issues/11457
 class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "moussaKam/mbarthez"
     tokenizer_class = BarthezTokenizer
     rust_tokenizer_class = BarthezTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/bartpho/test_tokenization_bartpho.py b/tests/models/bartpho/test_tokenization_bartpho.py
index 1fc06e38e04..023584e91f8 100644
--- a/tests/models/bartpho/test_tokenization_bartpho.py
+++ b/tests/models/bartpho/test_tokenization_bartpho.py
@@ -26,6 +26,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model")
 class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "vinai/bartpho-syllable"
     tokenizer_class = BartphoTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py
index bee1ccf0d15..cf3cc1dce10 100644
--- a/tests/models/bert/test_tokenization_bert.py
+++ b/tests/models/bert/test_tokenization_bert.py
@@ -34,6 +34,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
 @require_tokenizers
 class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google-bert/bert-base-uncased"
     tokenizer_class = BertTokenizer
     rust_tokenizer_class = BertTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/bert_generation/test_tokenization_bert_generation.py b/tests/models/bert_generation/test_tokenization_bert_generation.py
index 41d99288351..e1ccfba8f4e 100644
--- a/tests/models/bert_generation/test_tokenization_bert_generation.py
+++ b/tests/models/bert_generation/test_tokenization_bert_generation.py
@@ -29,6 +29,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/bert_for_seq_generation_L-24_bbc_encoder"
     tokenizer_class = BertGenerationTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
index d2a7accb390..d4954c96522 100644
--- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py
+++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
@@ -36,6 +36,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @custom_tokenizers
 class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "cl-tohoku/bert-base-japanese"
     tokenizer_class = BertJapaneseTokenizer
     test_rust_tokenizer = False
     space_between_special_tokens = True
@@ -403,6 +404,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 @custom_tokenizers
 class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "cl-tohoku/bert-base-japanese"
     tokenizer_class = BertJapaneseTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/bertweet/test_tokenization_bertweet.py b/tests/models/bertweet/test_tokenization_bertweet.py
index 2a4c643269c..71e0a0afe5b 100644
--- a/tests/models/bertweet/test_tokenization_bertweet.py
+++ b/tests/models/bertweet/test_tokenization_bertweet.py
@@ -22,6 +22,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "vinai/bertweet-base"
     tokenizer_class = BertweetTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/big_bird/test_tokenization_big_bird.py b/tests/models/big_bird/test_tokenization_big_bird.py
index 23b25e40294..863d30e8499 100644
--- a/tests/models/big_bird/test_tokenization_big_bird.py
+++ b/tests/models/big_bird/test_tokenization_big_bird.py
@@ -30,6 +30,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/bigbird-roberta-base"
     tokenizer_class = BigBirdTokenizer
     rust_tokenizer_class = BigBirdTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/biogpt/test_tokenization_biogpt.py b/tests/models/biogpt/test_tokenization_biogpt.py
index c350f5de0ea..ea52a7cf7f3 100644
--- a/tests/models/biogpt/test_tokenization_biogpt.py
+++ b/tests/models/biogpt/test_tokenization_biogpt.py
@@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_sacremoses
 class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/biogpt"
     tokenizer_class = BioGptTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py
index b022e77682b..369dde6739a 100644
--- a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py
+++ b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py
@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/blenderbot_small-90M"
     tokenizer_class = BlenderbotSmallTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py
index 02491929d14..4fbfcb8923e 100644
--- a/tests/models/bloom/test_tokenization_bloom.py
+++ b/tests/models/bloom/test_tokenization_bloom.py
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "bigscience/tokenizer"
     slow_tokenizer_class = None
     rust_tokenizer_class = BloomTokenizerFast
     tokenizer_class = BloomTokenizerFast
diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py
index 33254b96de8..624338b7f0b 100644
--- a/tests/models/camembert/test_tokenization_camembert.py
+++ b/tests/models/camembert/test_tokenization_camembert.py
@@ -32,6 +32,7 @@ FRAMEWORK = "pt" if is_torch_available() else "tf"
 @require_sentencepiece
 @require_tokenizers
 class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "almanach/camembert-base"
     tokenizer_class = CamembertTokenizer
     rust_tokenizer_class = CamembertTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py
index da4665e4cf0..eb3e6d9b4af 100644
--- a/tests/models/canine/test_tokenization_canine.py
+++ b/tests/models/canine/test_tokenization_canine.py
@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "nielsr/canine-s"
     tokenizer_class = CanineTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/clip/test_tokenization_clip.py b/tests/models/clip/test_tokenization_clip.py
index 4f1d9a73ef0..ec1cbd08ac5 100644
--- a/tests/models/clip/test_tokenization_clip.py
+++ b/tests/models/clip/test_tokenization_clip.py
@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "openai/clip-vit-base-patch32"
     tokenizer_class = CLIPTokenizer
     rust_tokenizer_class = CLIPTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/clvp/test_tokenization_clvp.py b/tests/models/clvp/test_tokenization_clvp.py
index b6368887595..7bb522f4144 100644
--- a/tests/models/clvp/test_tokenization_clvp.py
+++ b/tests/models/clvp/test_tokenization_clvp.py
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, slow
 class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "susnato/clvp_dev"
     tokenizer_class = ClvpTokenizer
     test_rust_tokenizer = False
     from_pretrained_kwargs = {"add_prefix_space": True}
diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py
index d39454b0fac..2a71ded72a5 100644
--- a/tests/models/code_llama/test_tokenization_code_llama.py
+++ b/tests/models/code_llama/test_tokenization_code_llama.py
@@ -51,6 +51,7 @@ if is_torch_available():
 @require_sentencepiece
 @require_tokenizers
 class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "hf-internal-testing/llama-code-tokenizer"
     tokenizer_class = CodeLlamaTokenizer
     rust_tokenizer_class = CodeLlamaTokenizerFast
     test_rust_tokenizer = False
diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py
index edffbeaec9a..025ed99b9ac 100644
--- a/tests/models/codegen/test_tokenization_codegen.py
+++ b/tests/models/codegen/test_tokenization_codegen.py
@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "Salesforce/codegen-350M-mono"
     tokenizer_class = CodeGenTokenizer
     rust_tokenizer_class = CodeGenTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/cpmant/test_tokenization_cpmant.py b/tests/models/cpmant/test_tokenization_cpmant.py
index f5d0ef32450..042473065be 100644
--- a/tests/models/cpmant/test_tokenization_cpmant.py
+++ b/tests/models/cpmant/test_tokenization_cpmant.py
@@ -24,6 +24,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_jieba
 class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "openbmb/cpm-ant-10b"
     tokenizer_class = CpmAntTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/ctrl/test_tokenization_ctrl.py b/tests/models/ctrl/test_tokenization_ctrl.py
index 02c3459f9e0..7fe61f36074 100644
--- a/tests/models/ctrl/test_tokenization_ctrl.py
+++ b/tests/models/ctrl/test_tokenization_ctrl.py
@@ -23,6 +23,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "Salesforce/ctrl"
     tokenizer_class = CTRLTokenizer
     test_rust_tokenizer = False
     test_seq2seq = False
diff --git a/tests/models/deberta/test_tokenization_deberta.py b/tests/models/deberta/test_tokenization_deberta.py
index 81d7bd95bd8..96248cf2ec1 100644
--- a/tests/models/deberta/test_tokenization_deberta.py
+++ b/tests/models/deberta/test_tokenization_deberta.py
@@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/deberta-base"
     tokenizer_class = DebertaTokenizer
     test_rust_tokenizer = True
     rust_tokenizer_class = DebertaTokenizerFast
diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py
index c75f45bfe8d..55f7e8b5429 100644
--- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py
+++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
 @require_sentencepiece
 @require_tokenizers
 class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/deberta-v2-xlarge"
     tokenizer_class = DebertaV2Tokenizer
     rust_tokenizer_class = DebertaV2TokenizerFast
     test_sentencepiece = True
diff --git a/tests/models/distilbert/test_tokenization_distilbert.py b/tests/models/distilbert/test_tokenization_distilbert.py
index 09422395720..c61393f6a6a 100644
--- a/tests/models/distilbert/test_tokenization_distilbert.py
+++ b/tests/models/distilbert/test_tokenization_distilbert.py
@@ -25,6 +25,7 @@ class DistilBertTokenizationTest(BertTokenizationTest):
     tokenizer_class = DistilBertTokenizer
     rust_tokenizer_class = DistilBertTokenizerFast
     test_rust_tokenizer = True
+    from_pretrained_id = "distilbert/distilbert-base-uncased"
 
     @slow
     def test_sequence_builders(self):
diff --git a/tests/models/dpr/test_tokenization_dpr.py b/tests/models/dpr/test_tokenization_dpr.py
index 2e0f41da4d5..1fd3d8bdb9e 100644
--- a/tests/models/dpr/test_tokenization_dpr.py
+++ b/tests/models/dpr/test_tokenization_dpr.py
@@ -33,6 +33,7 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest):
     tokenizer_class = DPRContextEncoderTokenizer
     rust_tokenizer_class = DPRContextEncoderTokenizerFast
     test_rust_tokenizer = True
+    from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base"
 
 
 @require_tokenizers
@@ -40,6 +41,7 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
     tokenizer_class = DPRQuestionEncoderTokenizer
     rust_tokenizer_class = DPRQuestionEncoderTokenizerFast
     test_rust_tokenizer = True
+    from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base"
 
 
 @require_tokenizers
@@ -47,6 +49,7 @@ class DPRReaderTokenizationTest(BertTokenizationTest):
     tokenizer_class = DPRReaderTokenizer
     rust_tokenizer_class = DPRReaderTokenizerFast
     test_rust_tokenizer = True
+    from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base"
 
     @slow
     def test_decode_best_spans(self):
diff --git a/tests/models/electra/test_tokenization_electra.py b/tests/models/electra/test_tokenization_electra.py
index 1c9b517f1f1..64611cb09c1 100644
--- a/tests/models/electra/test_tokenization_electra.py
+++ b/tests/models/electra/test_tokenization_electra.py
@@ -33,6 +33,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
 @require_tokenizers
 class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/electra-small-generator"
     tokenizer_class = ElectraTokenizer
     rust_tokenizer_class = ElectraTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/ernie_m/test_tokenization_ernie_m.py b/tests/models/ernie_m/test_tokenization_ernie_m.py
index 19f144df453..5cc5ec6991b 100644
--- a/tests/models/ernie_m/test_tokenization_ernie_m.py
+++ b/tests/models/ernie_m/test_tokenization_ernie_m.py
@@ -28,6 +28,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
 @require_sentencepiece
 @require_tokenizers
 class ErnieMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "susnato/ernie-m-base_pytorch"
     tokenizer_class = ErnieMTokenizer
     test_seq2seq = False
     test_sentencepiece = True
diff --git a/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py
index 0c0e2696170..119e35555a8 100644
--- a/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py
+++ b/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py
@@ -24,6 +24,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_g2p_en
 class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "espnet/fastspeech2_conformer"
     tokenizer_class = FastSpeech2ConformerTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/fnet/test_tokenization_fnet.py b/tests/models/fnet/test_tokenization_fnet.py
index 85080efc3e5..a3492cf966c 100644
--- a/tests/models/fnet/test_tokenization_fnet.py
+++ b/tests/models/fnet/test_tokenization_fnet.py
@@ -28,6 +28,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
 @require_sentencepiece
 @require_tokenizers
 class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/fnet-base"
     tokenizer_class = FNetTokenizer
     rust_tokenizer_class = FNetTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/fsmt/test_tokenization_fsmt.py b/tests/models/fsmt/test_tokenization_fsmt.py
index 7407c2fbc86..4be15cbee13 100644
--- a/tests/models/fsmt/test_tokenization_fsmt.py
+++ b/tests/models/fsmt/test_tokenization_fsmt.py
@@ -30,6 +30,7 @@ FSMT_TINY2 = "stas/tiny-wmt19-en-ru"
 class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "stas/tiny-wmt19-en-de"
     tokenizer_class = FSMTTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/funnel/test_tokenization_funnel.py b/tests/models/funnel/test_tokenization_funnel.py
index 6c5eb87db17..7628582e9fc 100644
--- a/tests/models/funnel/test_tokenization_funnel.py
+++ b/tests/models/funnel/test_tokenization_funnel.py
@@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "funnel-transformer/small"
     tokenizer_class = FunnelTokenizer
     rust_tokenizer_class = FunnelTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py
index a16d471a24b..5e485da491f 100644
--- a/tests/models/gemma/test_tokenization_gemma.py
+++ b/tests/models/gemma/test_tokenization_gemma.py
@@ -49,6 +49,7 @@ if is_torch_available():
 @require_sentencepiece
 @require_tokenizers
 class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/gemma-7b"
     tokenizer_class = GemmaTokenizer
     rust_tokenizer_class = GemmaTokenizerFast
diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py
index 78906e3db32..1e7c81e4be2 100644
--- a/tests/models/gpt2/test_tokenization_gpt2.py
+++ b/tests/models/gpt2/test_tokenization_gpt2.py
@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "openai-community/gpt2"
     tokenizer_class = GPT2Tokenizer
     rust_tokenizer_class = GPT2TokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
index 293116a24e3..ec505da4a00 100644
--- a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
+++ b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
@@ -29,6 +29,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "abeja/gpt-neox-japanese-2.7b"
     tokenizer_class = GPTNeoXJapaneseTokenizer
     test_rust_tokenizer = False
     from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
diff --git a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
index 7bbaf748828..ae9526342cb 100644
--- a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
+++ b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_with_bytefallback.mode
 @require_sentencepiece
 @require_tokenizers
 class GPTSw3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "AI-Sweden-Models/gpt-sw3-126m"
     tokenizer_class = GPTSw3Tokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
diff --git a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
index 6d656b2d0ff..8d989a51a73 100644
--- a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
+++ b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
@@ -29,6 +29,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class GPTSanJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "Tanrei/GPTSAN-japanese"
     tokenizer_class = GPTSanJapaneseTokenizer
     test_rust_tokenizer = False
     from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
diff --git a/tests/models/herbert/test_tokenization_herbert.py b/tests/models/herbert/test_tokenization_herbert.py
index d035348b739..b8bbd777581 100644
--- a/tests/models/herbert/test_tokenization_herbert.py
+++ b/tests/models/herbert/test_tokenization_herbert.py
@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_sacremoses
 @require_tokenizers
 class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "allegro/herbert-base-cased"
     tokenizer_class = HerbertTokenizer
     rust_tokenizer_class = HerbertTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/layoutlm/test_tokenization_layoutlm.py b/tests/models/layoutlm/test_tokenization_layoutlm.py
index b73b2aa8e44..a34811a90b0 100644
--- a/tests/models/layoutlm/test_tokenization_layoutlm.py
+++ b/tests/models/layoutlm/test_tokenization_layoutlm.py
@@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/layoutlm-base-uncased"
     tokenizer_class = LayoutLMTokenizer
     rust_tokenizer_class = LayoutLMTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
index 3360933be67..61fafa23dac 100644
--- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
@@ -61,6 +61,7 @@ logger = logging.get_logger(__name__)
 @require_tokenizers
 @require_pandas
 class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/layoutlmv2-base-uncased"
     tokenizer_class = LayoutLMv2Tokenizer
     rust_tokenizer_class = LayoutLMv2TokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
index db02dc65d65..c7af4fbddc7 100644
--- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
@@ -49,6 +49,7 @@ logger = logging.get_logger(__name__)
 @require_tokenizers
 @require_pandas
 class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/layoutlmv3-base"
     tokenizer_class = LayoutLMv3Tokenizer
     rust_tokenizer_class = LayoutLMv3TokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
index 086bbc6ba0b..474cc531718 100644
--- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py
+++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
@@ -54,6 +54,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_tokenizers
 @require_pandas
 class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "FacebookAI/xlm-roberta-base"
     tokenizer_class = LayoutXLMTokenizer
     rust_tokenizer_class = LayoutXLMTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/led/test_tokenization_led.py b/tests/models/led/test_tokenization_led.py
index 7ff81749946..f287677a129 100644
--- a/tests/models/led/test_tokenization_led.py
+++ b/tests/models/led/test_tokenization_led.py
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "allenai/led-base-16384"
     tokenizer_class = LEDTokenizer
     rust_tokenizer_class = LEDTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
index f3674a83b08..e99104f7f6d 100644
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -52,6 +52,7 @@ if is_torch_available():
 @require_sentencepiece
 @require_tokenizers
 class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "hf-internal-testing/llama-tokenizer"
     tokenizer_class = LlamaTokenizer
     rust_tokenizer_class = LlamaTokenizerFast
diff --git a/tests/models/longformer/test_tokenization_longformer.py b/tests/models/longformer/test_tokenization_longformer.py
index 42524ca65a6..1d1eda3380d 100644
--- a/tests/models/longformer/test_tokenization_longformer.py
+++ b/tests/models/longformer/test_tokenization_longformer.py
@@ -30,6 +30,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest with FacebookAI/roberta-base->allenai/longformer-base-4096,Roberta->Longformer,roberta->longformer,
 class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "allenai/longformer-base-4096"
     # Ignore copy
     tokenizer_class = LongformerTokenizer
     test_slow_tokenizer = True
diff --git a/tests/models/luke/test_tokenization_luke.py b/tests/models/luke/test_tokenization_luke.py
index 26797faf775..0e5d9123156 100644
--- a/tests/models/luke/test_tokenization_luke.py
+++ b/tests/models/luke/test_tokenization_luke.py
@@ -28,6 +28,7 @@ SAMPLE_ENTITY_VOCAB = get_tests_dir("fixtures/test_entity_vocab.json")
 class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "studio-ousia/luke-base"
     tokenizer_class = LukeTokenizer
     test_rust_tokenizer = False
     from_pretrained_kwargs = {"cls_token": "<s>"}
diff --git a/tests/models/lxmert/test_tokenization_lxmert.py b/tests/models/lxmert/test_tokenization_lxmert.py
index e094427f761..716e3b971fa 100644
--- a/tests/models/lxmert/test_tokenization_lxmert.py
+++ b/tests/models/lxmert/test_tokenization_lxmert.py
@@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "unc-nlp/lxmert-base-uncased"
     tokenizer_class = LxmertTokenizer
     rust_tokenizer_class = LxmertTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/m2m_100/test_tokenization_m2m_100.py b/tests/models/m2m_100/test_tokenization_m2m_100.py
index 50087a7d9d2..ced6cf13dea 100644
--- a/tests/models/m2m_100/test_tokenization_m2m_100.py
+++ b/tests/models/m2m_100/test_tokenization_m2m_100.py
@@ -48,6 +48,7 @@ FR_CODE = 128028
 @require_sentencepiece
 class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/m2m100_418M"
     tokenizer_class = M2M100Tokenizer
     test_rust_tokenizer = False
     test_seq2seq = False
diff --git a/tests/models/marian/test_tokenization_marian.py b/tests/models/marian/test_tokenization_marian.py
index 6fb3c9a85d0..3ef85e24de6 100644
--- a/tests/models/marian/test_tokenization_marian.py
+++ b/tests/models/marian/test_tokenization_marian.py
@@ -45,6 +45,7 @@ else:
 @require_sentencepiece
 class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "Helsinki-NLP/opus-mt-en-de"
     tokenizer_class = MarianTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py
index e793a9a5070..df1b9ed0838 100644
--- a/tests/models/markuplm/test_tokenization_markuplm.py
+++ b/tests/models/markuplm/test_tokenization_markuplm.py
@@ -41,6 +41,7 @@ logger = logging.get_logger(__name__)
 @require_tokenizers
 class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/markuplm-base"
     tokenizer_class = MarkupLMTokenizer
     rust_tokenizer_class = MarkupLMTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/mbart/test_tokenization_mbart.py b/tests/models/mbart/test_tokenization_mbart.py
index e5a9d8c07f2..635be07aa1c 100644
--- a/tests/models/mbart/test_tokenization_mbart.py
+++ b/tests/models/mbart/test_tokenization_mbart.py
@@ -41,6 +41,7 @@ RO_CODE = 250020
 @require_sentencepiece
 @require_tokenizers
 class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/mbart-large-en-ro"
     tokenizer_class = MBartTokenizer
     rust_tokenizer_class = MBartTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/mbart50/test_tokenization_mbart50.py b/tests/models/mbart50/test_tokenization_mbart50.py
index a5ba802b6c3..799cd8afc3e 100644
--- a/tests/models/mbart50/test_tokenization_mbart50.py
+++ b/tests/models/mbart50/test_tokenization_mbart50.py
@@ -41,6 +41,7 @@ RO_CODE = 250020
 @require_sentencepiece
 @require_tokenizers
 class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/mbart-large-50-one-to-many-mmt"
     tokenizer_class = MBart50Tokenizer
     rust_tokenizer_class = MBart50TokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/mgp_str/test_tokenization_mgp_str.py b/tests/models/mgp_str/test_tokenization_mgp_str.py
index 0d0e6bb0bf1..035d43cc438 100644
--- a/tests/models/mgp_str/test_tokenization_mgp_str.py
+++ b/tests/models/mgp_str/test_tokenization_mgp_str.py
@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class MgpstrTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "alibaba-damo/mgp-str-base"
     tokenizer_class = MgpstrTokenizer
     test_rust_tokenizer = False
     from_pretrained_kwargs = {}
diff --git a/tests/models/mluke/test_tokenization_mluke.py b/tests/models/mluke/test_tokenization_mluke.py
index a466ae547ce..0497fa849ca 100644
--- a/tests/models/mluke/test_tokenization_mluke.py
+++ b/tests/models/mluke/test_tokenization_mluke.py
@@ -28,6 +28,7 @@ SAMPLE_ENTITY_VOCAB = get_tests_dir("fixtures/test_entity_vocab.json")
 class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "studio-ousia/mluke-base"
     tokenizer_class = MLukeTokenizer
     test_rust_tokenizer = False
     from_pretrained_kwargs = {"cls_token": "<s>"}
diff --git a/tests/models/mobilebert/test_tokenization_mobilebert.py b/tests/models/mobilebert/test_tokenization_mobilebert.py
index 92ddd88684b..4d5e09e08d1 100644
--- a/tests/models/mobilebert/test_tokenization_mobilebert.py
+++ b/tests/models/mobilebert/test_tokenization_mobilebert.py
@@ -34,6 +34,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
 @require_tokenizers
 class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "mobilebert-uncased"
     tokenizer_class = MobileBertTokenizer
     rust_tokenizer_class = MobileBertTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/mpnet/test_tokenization_mpnet.py b/tests/models/mpnet/test_tokenization_mpnet.py
index e30dd3a9145..b63dc7ab644 100644
--- a/tests/models/mpnet/test_tokenization_mpnet.py
+++ b/tests/models/mpnet/test_tokenization_mpnet.py
@@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/mpnet-base"
     tokenizer_class = MPNetTokenizer
     rust_tokenizer_class = MPNetTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/mvp/test_tokenization_mvp.py b/tests/models/mvp/test_tokenization_mvp.py
index 8bddb8443b6..a442848d4d5 100644
--- a/tests/models/mvp/test_tokenization_mvp.py
+++ b/tests/models/mvp/test_tokenization_mvp.py
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_det
 @require_tokenizers
 class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "RUCAIBox/mvp"
     tokenizer_class = MvpTokenizer
     rust_tokenizer_class = MvpTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py
index 4446522f9d2..92134c3f8ba 100644
--- a/tests/models/nllb/test_tokenization_nllb.py
+++ b/tests/models/nllb/test_tokenization_nllb.py
@@ -49,6 +49,7 @@ RO_CODE = 256145
 @require_sentencepiece
 @require_tokenizers
 class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/nllb-200-distilled-600M"
     tokenizer_class = NllbTokenizer
     rust_tokenizer_class = NllbTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/nougat/test_tokenization_nougat.py b/tests/models/nougat/test_tokenization_nougat.py
index bfb1090dada..088ce56f6e6 100644
--- a/tests/models/nougat/test_tokenization_nougat.py
+++ b/tests/models/nougat/test_tokenization_nougat.py
@@ -24,6 +24,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class NougatTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/nougat-base"
     slow_tokenizer_class = None
     rust_tokenizer_class = NougatTokenizerFast
     tokenizer_class = NougatTokenizerFast
diff --git a/tests/models/openai/test_tokenization_openai.py b/tests/models/openai/test_tokenization_openai.py
index 26030632918..1f5ef5a35b3 100644
--- a/tests/models/openai/test_tokenization_openai.py
+++ b/tests/models/openai/test_tokenization_openai.py
@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "openai-community/openai-gpt"
     """Tests OpenAIGPTTokenizer that uses BERT BasicTokenizer."""
 
     tokenizer_class = OpenAIGPTTokenizer
diff --git a/tests/models/pegasus/test_tokenization_pegasus.py b/tests/models/pegasus/test_tokenization_pegasus.py
index 3abe36d1183..66a68a97fc7 100644
--- a/tests/models/pegasus/test_tokenization_pegasus.py
+++ b/tests/models/pegasus/test_tokenization_pegasus.py
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")
 @require_sentencepiece
 @require_tokenizers
 class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/pegasus-xsum"
     tokenizer_class = PegasusTokenizer
     rust_tokenizer_class = PegasusTokenizerFast
     test_rust_tokenizer = True
@@ -135,6 +136,7 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 @require_sentencepiece
 @require_tokenizers
 class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/pegasus-xsum"
     tokenizer_class = PegasusTokenizer
     rust_tokenizer_class = PegasusTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py
index 6d9a9bd8639..b5d149e5f29 100644
--- a/tests/models/perceiver/test_tokenization_perceiver.py
+++ b/tests/models/perceiver/test_tokenization_perceiver.py
@@ -36,6 +36,7 @@ else:
 class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "deepmind/language-perceiver"
     tokenizer_class = PerceiverTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/phobert/test_tokenization_phobert.py b/tests/models/phobert/test_tokenization_phobert.py
index 6624957531b..bdf02d5f51a 100644
--- a/tests/models/phobert/test_tokenization_phobert.py
+++ b/tests/models/phobert/test_tokenization_phobert.py
@@ -22,6 +22,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "vinai/phobert-base"
     tokenizer_class = PhobertTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/plbart/test_tokenization_plbart.py b/tests/models/plbart/test_tokenization_plbart.py
index f9cc38e0de6..ff0ef386e37 100644
--- a/tests/models/plbart/test_tokenization_plbart.py
+++ b/tests/models/plbart/test_tokenization_plbart.py
@@ -40,6 +40,7 @@ PYTHON_CODE = 50002
 @require_sentencepiece
 @require_tokenizers
 class PLBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "uclanlp/plbart-base"
     tokenizer_class = PLBartTokenizer
     rust_tokenizer_class = None
     test_rust_tokenizer = False
diff --git a/tests/models/prophetnet/test_tokenization_prophetnet.py b/tests/models/prophetnet/test_tokenization_prophetnet.py
index cf4317b3a66..09390db48bf 100644
--- a/tests/models/prophetnet/test_tokenization_prophetnet.py
+++ b/tests/models/prophetnet/test_tokenization_prophetnet.py
@@ -32,6 +32,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/prophetnet-large-uncased"
     tokenizer_class = ProphetNetTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/qwen2/test_tokenization_qwen2.py b/tests/models/qwen2/test_tokenization_qwen2.py
index 49c62b5241a..3193141b845 100644
--- a/tests/models/qwen2/test_tokenization_qwen2.py
+++ b/tests/models/qwen2/test_tokenization_qwen2.py
@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "qwen/qwen-tokenizer"
     tokenizer_class = Qwen2Tokenizer
     rust_tokenizer_class = Qwen2TokenizerFast
     test_slow_tokenizer = True
diff --git a/tests/models/realm/test_tokenization_realm.py b/tests/models/realm/test_tokenization_realm.py
index 7dbd8df6ef2..f963fb347fa 100644
--- a/tests/models/realm/test_tokenization_realm.py
+++ b/tests/models/realm/test_tokenization_realm.py
@@ -33,6 +33,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
 @require_tokenizers
 class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/realm-cc-news-pretrained-embedder"
     tokenizer_class = RealmTokenizer
     rust_tokenizer_class = RealmTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/reformer/test_tokenization_reformer.py b/tests/models/reformer/test_tokenization_reformer.py
index 0f72bf311a2..ee9bd52f435 100644
--- a/tests/models/reformer/test_tokenization_reformer.py
+++ b/tests/models/reformer/test_tokenization_reformer.py
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/reformer-crime-and-punishment"
     tokenizer_class = ReformerTokenizer
     rust_tokenizer_class = ReformerTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/rembert/test_tokenization_rembert.py b/tests/models/rembert/test_tokenization_rembert.py
index 12c43643be7..5f65629213e 100644
--- a/tests/models/rembert/test_tokenization_rembert.py
+++ b/tests/models/rembert/test_tokenization_rembert.py
@@ -32,6 +32,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/rembert"
     tokenizer_class = RemBertTokenizer
     rust_tokenizer_class = RemBertTokenizerFast
     space_between_special_tokens = True
diff --git a/tests/models/roberta/test_tokenization_roberta.py b/tests/models/roberta/test_tokenization_roberta.py
index 5d457c4cb44..83f444d1629 100644
--- a/tests/models/roberta/test_tokenization_roberta.py
+++ b/tests/models/roberta/test_tokenization_roberta.py
@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "FacebookAI/roberta-base"
     tokenizer_class = RobertaTokenizer
     rust_tokenizer_class = RobertaTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/roc_bert/test_tokenization_roc_bert.py b/tests/models/roc_bert/test_tokenization_roc_bert.py
index 6a24514b3c2..4fb2b172d66 100644
--- a/tests/models/roc_bert/test_tokenization_roc_bert.py
+++ b/tests/models/roc_bert/test_tokenization_roc_bert.py
@@ -34,6 +34,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
 @require_tokenizers
 class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "weiweishi/roc-bert-base-zh"
     tokenizer_class = RoCBertTokenizer
     rust_tokenizer_class = None
     test_rust_tokenizer = False
diff --git a/tests/models/roformer/test_tokenization_roformer.py b/tests/models/roformer/test_tokenization_roformer.py
index 3af411b6a80..c1096081070 100644
--- a/tests/models/roformer/test_tokenization_roformer.py
+++ b/tests/models/roformer/test_tokenization_roformer.py
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_rjieba
 @require_tokenizers
 class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "junnyu/roformer_chinese_small"
     tokenizer_class = RoFormerTokenizer
     rust_tokenizer_class = RoFormerTokenizerFast
     space_between_special_tokens = True
diff --git a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
index c7d16796c4c..2e65d01ead8 100644
--- a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
@@ -53,6 +53,7 @@ SMALL_TRAINING_CORPUS = [
 @require_sentencepiece
 @require_tokenizers
 class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/hf-seamless-m4t-medium"
     tokenizer_class = SeamlessM4TTokenizer
     rust_tokenizer_class = SeamlessM4TTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/siglip/test_tokenization_siglip.py b/tests/models/siglip/test_tokenization_siglip.py
index 586d6b0089c..fb3cb5b3f10 100644
--- a/tests/models/siglip/test_tokenization_siglip.py
+++ b/tests/models/siglip/test_tokenization_siglip.py
@@ -38,6 +38,7 @@ else:
 @require_sentencepiece
 @require_tokenizers
 class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/siglip-base-patch16-224"
     tokenizer_class = SiglipTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
diff --git a/tests/models/speech_to_text/test_tokenization_speech_to_text.py b/tests/models/speech_to_text/test_tokenization_speech_to_text.py
index b0cb1acc856..6bea58ddfcf 100644
--- a/tests/models/speech_to_text/test_tokenization_speech_to_text.py
+++ b/tests/models/speech_to_text/test_tokenization_speech_to_text.py
@@ -37,6 +37,7 @@ ES_CODE = 10
 @require_sentencepiece
 @require_tokenizers
 class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/s2t-small-librispeech-asr"
     tokenizer_class = Speech2TextTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
diff --git a/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py b/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py
index 1000cce2898..df433d67d96 100644
--- a/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py
+++ b/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/s2t-wav2vec2-large-en-de"
     tokenizer_class = Speech2Text2Tokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/speecht5/test_tokenization_speecht5.py b/tests/models/speecht5/test_tokenization_speecht5.py
index a8af8d274a3..d007b14dd22 100644
--- a/tests/models/speecht5/test_tokenization_speecht5.py
+++ b/tests/models/speecht5/test_tokenization_speecht5.py
@@ -30,6 +30,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe_char.model")
 @require_sentencepiece
 @require_tokenizers
 class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/speecht5_asr"
     tokenizer_class = SpeechT5Tokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
diff --git a/tests/models/squeezebert/test_tokenization_squeezebert.py b/tests/models/squeezebert/test_tokenization_squeezebert.py
index a6586255640..3ac24e8374b 100644
--- a/tests/models/squeezebert/test_tokenization_squeezebert.py
+++ b/tests/models/squeezebert/test_tokenization_squeezebert.py
@@ -25,6 +25,7 @@ class SqueezeBertTokenizationTest(BertTokenizationTest):
     tokenizer_class = SqueezeBertTokenizer
     rust_tokenizer_class = SqueezeBertTokenizerFast
     test_rust_tokenizer = True
+    from_pretrained_id = "squeezebert/squeezebert-uncased"
 
     def get_rust_tokenizer(self, **kwargs):
         return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py
index b0755dc1ba0..388388ff238 100644
--- a/tests/models/t5/test_tokenization_t5.py
+++ b/tests/models/t5/test_tokenization_t5.py
@@ -38,6 +38,7 @@ else:
 @require_sentencepiece
 @require_tokenizers
 class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google-t5/t5-small"
     tokenizer_class = T5Tokenizer
     rust_tokenizer_class = T5TokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py
index 692dc91b6d8..9aa3ad6d582 100644
--- a/tests/models/tapas/test_tokenization_tapas.py
+++ b/tests/models/tapas/test_tokenization_tapas.py
@@ -53,6 +53,7 @@ else:
 @require_tokenizers
 @require_pandas
 class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/tapas-large-finetuned-sqa"
     tokenizer_class = TapasTokenizer
     test_rust_tokenizer = False
     space_between_special_tokens = True
diff --git a/tests/models/udop/test_tokenization_udop.py b/tests/models/udop/test_tokenization_udop.py
index cc9a2f28520..f9ad6c7abec 100644
--- a/tests/models/udop/test_tokenization_udop.py
+++ b/tests/models/udop/test_tokenization_udop.py
@@ -54,6 +54,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_tokenizers
 @require_pandas
 class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/udop-large"
     tokenizer_class = UdopTokenizer
     rust_tokenizer_class = UdopTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/vits/test_tokenization_vits.py b/tests/models/vits/test_tokenization_vits.py
index c02caaaa908..fee6bac3a4f 100644
--- a/tests/models/vits/test_tokenization_vits.py
+++ b/tests/models/vits/test_tokenization_vits.py
@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/mms-tts-eng"
     tokenizer_class = VitsTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/wav2vec2/test_tokenization_wav2vec2.py b/tests/models/wav2vec2/test_tokenization_wav2vec2.py
index 3ab8717d819..7310b148484 100644
--- a/tests/models/wav2vec2/test_tokenization_wav2vec2.py
+++ b/tests/models/wav2vec2/test_tokenization_wav2vec2.py
@@ -367,6 +367,7 @@ class Wav2Vec2TokenizerTest(unittest.TestCase):
 class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/wav2vec2-base-960h"
     tokenizer_class = Wav2Vec2CTCTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
index 0411a863bc7..56e38f2cf5d 100644
--- a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
+++ b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_phonemizer
 class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/wav2vec2-lv-60-espeak-cv-ft"
     tokenizer_class = Wav2Vec2PhonemeCTCTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 170857cffb9..bb22a36f084 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -31,6 +31,7 @@ NOTIMESTAMPS = 50363
 class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "openai/whisper-tiny"
     tokenizer_class = WhisperTokenizer
     rust_tokenizer_class = WhisperTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/xglm/test_tokenization_xglm.py b/tests/models/xglm/test_tokenization_xglm.py
index 61674976a38..02c58681d10 100644
--- a/tests/models/xglm/test_tokenization_xglm.py
+++ b/tests/models/xglm/test_tokenization_xglm.py
@@ -31,6 +31,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/xglm-564M"
     tokenizer_class = XGLMTokenizer
     rust_tokenizer_class = XGLMTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/xlm/test_tokenization_xlm.py b/tests/models/xlm/test_tokenization_xlm.py
index 4b5982ca985..6bc7fedad48 100644
--- a/tests/models/xlm/test_tokenization_xlm.py
+++ b/tests/models/xlm/test_tokenization_xlm.py
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "FacebookAI/xlm-mlm-en-2048"
     tokenizer_class = XLMTokenizer
     test_rust_tokenizer = False
diff --git a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py b/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
index 679e808dc97..cadcc600490 100644
--- a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
+++ b/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/xprophetnet-large-wiki100-cased"
     tokenizer_class = XLMProphetNetTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
diff --git a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
index 6e2d4446a02..8c3674460da 100644
--- a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
+++ b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
@@ -31,6 +31,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "FacebookAI/xlm-roberta-base"
     tokenizer_class = XLMRobertaTokenizer
     rust_tokenizer_class = XLMRobertaTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/xlnet/test_tokenization_xlnet.py b/tests/models/xlnet/test_tokenization_xlnet.py
index 8a7476fad92..bd65e6c80b3 100644
--- a/tests/models/xlnet/test_tokenization_xlnet.py
+++ b/tests/models/xlnet/test_tokenization_xlnet.py
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "xlnet/xlnet-base-cased"
     tokenizer_class = XLNetTokenizer
     rust_tokenizer_class = XLNetTokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index d0c58749114..6c900fa72cd 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -186,6 +186,7 @@ class TokenizerTesterMixin:
     space_between_special_tokens = False
     from_pretrained_kwargs = None
     from_pretrained_filter = None
+    from_pretrained_id = None
     from_pretrained_vocab_key = "vocab_file"
 
     test_seq2seq = True
@@ -200,19 +201,13 @@ class TokenizerTesterMixin:
         # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
         # information available in Tokenizer (name, rust class, python class, vocab key name)
         if self.test_rust_tokenizer:
-            tokenizers_list = [
+            self.tokenizers_list = [
                 (
                     self.rust_tokenizer_class,
-                    pretrained_name,
+                    self.from_pretrained_id,
                     self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
                 )
-                for pretrained_name in self.rust_tokenizer_class.pretrained_vocab_files_map[
-                    self.from_pretrained_vocab_key
-                ].keys()
-                if self.from_pretrained_filter is None
-                or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name))
             ]
-            self.tokenizers_list = tokenizers_list[:1]  # Let's just test the first pretrained vocab for speed
         else:
             self.tokenizers_list = []
         with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: