diff --git a/scripts/fsmt/fsmt-make-super-tiny-model.py b/scripts/fsmt/fsmt-make-super-tiny-model.py
new file mode 100755
index 00000000000..2521799b63d
--- /dev/null
+++ b/scripts/fsmt/fsmt-make-super-tiny-model.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# This script creates a super tiny model that is useful inside tests, when we just want to test that
+# the machinery works, without needing to check the quality of the outcomes.
+#
+# This version creates a tiny vocab first, and then a tiny model - so the outcome is truly tiny -
+# all files ~60KB. Compare this to taking a full-size model and reducing its layers and emb
+# dimensions to the minimum, while keeping the full vocab + merges files, which leads to ~3MB in
+# total for all files. The latter approach is implemented by `fsmt-make-tiny-model.py`.
+#
+# It will then be used as "stas/tiny-wmt19-en-ru"
+
+from pathlib import Path
+import json
+import tempfile
+
+from transformers import FSMTTokenizer, FSMTConfig, FSMTForConditionalGeneration
+from transformers.tokenization_fsmt import VOCAB_FILES_NAMES
+
+mname_tiny = "tiny-wmt19-en-ru"
+
+# Build
+
+# borrowed from a test
+vocab = [ "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "lo", "low", "er</w>", "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>", ]
+vocab_tokens = dict(zip(vocab, range(len(vocab))))
+merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
+
+with tempfile.TemporaryDirectory() as tmpdirname:
+    build_dir = Path(tmpdirname)
+    src_vocab_file = build_dir / VOCAB_FILES_NAMES["src_vocab_file"]
+    tgt_vocab_file = build_dir / VOCAB_FILES_NAMES["tgt_vocab_file"]
+    merges_file = build_dir / VOCAB_FILES_NAMES["merges_file"]
+    with open(src_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens))
+    with open(tgt_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens))
+    with open(merges_file, "w") as fp: fp.write("\n".join(merges))
+
+    tokenizer = FSMTTokenizer(
+        langs=["en", "ru"],
+        src_vocab_size=len(vocab),
+        tgt_vocab_size=len(vocab),
+        src_vocab_file=src_vocab_file,
+        tgt_vocab_file=tgt_vocab_file,
+        merges_file=merges_file,
+    )
+
+config = FSMTConfig(
+    langs=["ru", "en"],
+    src_vocab_size=1000, tgt_vocab_size=1000,
+    d_model=4,
+    encoder_layers=1, decoder_layers=1,
+    encoder_ffn_dim=4, decoder_ffn_dim=4,
+    encoder_attention_heads=1, decoder_attention_heads=1,
+)
+
+tiny_model = FSMTForConditionalGeneration(config)
+print(f"num of params {tiny_model.num_parameters()}")
+
+# Test
+batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"])
+outputs = tiny_model(**batch, return_dict=True)
+
+print("test output:", len(outputs.logits[0]))
+
+# Save
+tiny_model.half() # makes it smaller
+tiny_model.save_pretrained(mname_tiny)
+tokenizer.save_pretrained(mname_tiny)
+
+print(f"Generated {mname_tiny}")
+
+# Upload
+# transformers-cli upload tiny-wmt19-en-ru
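For context, here is roughly how such a checkpoint gets consumed once uploaded. This is a minimal sketch, not part of the patch: it assumes the files generated by the script above were uploaded as "stas/tiny-wmt19-en-ru" (per the script's trailing upload comment), and it casts the weights back to fp32 because they were saved via half() and fp16 matmul is not supported on CPU.

# Minimal consumption sketch (assumes the checkpoint generated above was
# uploaded as "stas/tiny-wmt19-en-ru" via transformers-cli).
from transformers import FSMTForConditionalGeneration, FSMTTokenizer

mname = "stas/tiny-wmt19-en-ru"
tokenizer = FSMTTokenizer.from_pretrained(mname)
# cast back to fp32: the checkpoint was saved in fp16 (tiny_model.half())
model = FSMTForConditionalGeneration.from_pretrained(mname).float()

# in a machinery test only the shapes matter, not the output quality
batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"])
outputs = model(**batch, return_dict=True)
print(outputs.logits.shape)  # last dim is the config's tgt_vocab_size (1000)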
diff --git a/scripts/fsmt/fsmt-make-tiny-model.py b/scripts/fsmt/fsmt-make-tiny-model.py
index d51cbba4177..ba8abe0139c 100755
--- a/scripts/fsmt/fsmt-make-tiny-model.py
+++ b/scripts/fsmt/fsmt-make-tiny-model.py
@@ -1,10 +1,18 @@
 #!/usr/bin/env python
 # coding: utf-8
 
-# this script creates a tiny model that is useful inside tests, when we just want to test that the machinery works,
-# without needing to the check the quality of the outcomes.
-# it will be used then as "stas/tiny-wmt19-en-de"
+# This script creates a tiny model that is useful inside tests, when we just want to test that
+# the machinery works, without needing to check the quality of the outcomes.
+#
+# This version creates a tiny model through reduction of a normal pre-trained model, but keeps the
+# full vocab and merges file, and thus results in a larger model due to the large vocab size.
+# This gives ~3MB in total for all files.
+#
+# If you want a model 50 times smaller than this, see `fsmt-make-super-tiny-model.py`, which is slightly more complicated.
+#
+# It will then be used as "stas/tiny-wmt19-en-de"
 
+# Build
 from transformers import FSMTTokenizer, FSMTConfig, FSMTForConditionalGeneration
 mname = "facebook/wmt19-en-de"
 tokenizer = FSMTTokenizer.from_pretrained(mname)
@@ -18,16 +26,20 @@ config.update(dict(
 tiny_model = FSMTForConditionalGeneration(config)
 print(f"num of params {tiny_model.num_parameters()}")
 
-# Test it
+
+# Test
 batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"])
 outputs = tiny_model(**batch, return_dict=True)
 
-print(len(outputs.logits[0]))
+print("test output:", len(outputs.logits[0]))
+
 # Save
 mname_tiny = "tiny-wmt19-en-de"
 tiny_model.half() # makes it smaller
 tiny_model.save_pretrained(mname_tiny)
 tokenizer.save_pretrained(mname_tiny)
 
+print(f"Generated {mname_tiny}")
+
 # Upload
 # transformers-cli upload tiny-wmt19-en-de
diff --git a/tests/test_tokenization_fsmt.py b/tests/test_tokenization_fsmt.py
index c3e08d566ad..dfa39dbd9e4 100644
--- a/tests/test_tokenization_fsmt.py
+++ b/tests/test_tokenization_fsmt.py
@@ -25,6 +25,10 @@ from transformers.tokenization_fsmt import VOCAB_FILES_NAMES, FSMTTokenizer
 
 from .test_tokenization_common import TokenizerTesterMixin
 
+# using a different tiny model than the one used for the default params defined in init, to ensure proper testing
+FSMT_TINY2 = "stas/tiny-wmt19-en-ru"
+
+
 class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     tokenizer_class = FSMTTokenizer
@@ -86,6 +90,15 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def tokenizer_en_ru(self):
         return FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru")
 
+    def test_online_tokenizer_config(self):
+        """this just tests that the online tokenizer files get correctly fetched and
+        loaded via its tokenizer_config.json, and it's not slow, so it's run by normal CI
+        """
+        tokenizer = FSMTTokenizer.from_pretrained(FSMT_TINY2)
+        self.assertListEqual([tokenizer.src_lang, tokenizer.tgt_lang], ["en", "ru"])
+        self.assertEqual(tokenizer.src_vocab_size, 21)
+        self.assertEqual(tokenizer.tgt_vocab_size, 21)
+
     def test_full_tokenizer(self):
         """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
         tokenizer = FSMTTokenizer(self.langs, self.src_vocab_file, self.tgt_vocab_file, self.merges_file)
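A note on the hard-coded 21 in test_online_tokenizer_config: it is simply the number of unique entries in the tiny vocab built by fsmt-make-super-tiny-model.py. A standalone sanity check mirroring that script's vocab list:

# sanity check: the tiny vocab from fsmt-make-super-tiny-model.py has exactly
# 21 unique entries, which is what test_online_tokenizer_config asserts for
# both src_vocab_size and tgt_vocab_size
vocab = [ "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>",
          "lo", "low", "er</w>", "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>", ]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
assert len(vocab_tokens) == 21
print("tiny vocab size:", len(vocab_tokens))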