Conversion from slow to fast for BPE spm vocabs contained an error. (#10120)
* Conversion from slow to fast for BPE spm vocabs contained an error.
  - Only one test currently (tokenizers + slow) exercised the modified path, and it is Reformer, which does not modify any ids, so the bug was silent until now.
  - The real issue is that the `vocab` variable was overwritten by the output of SentencePieceExtractor, so the slow-tokenizer-specific vocab oddities were completely ignored.
  - The bug was reported in https://github.com/huggingface/transformers/issues/9518
  - Ran the complete tokenization test suite with slow tokenizers without error (`RUN_SLOW=1 pytest -sv tests/test_tokenization_*`)

* Remove rebase error.

* Adding the fixture.
commit c9837a0d27
parent dd3a7f9641
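For orientation, here is a minimal sketch of the failure mode described in the commit message and fixed by the diff below. `build_vocab` and `extract` are illustrative stand-ins for `SpmConverter.vocab(proto)` and `SentencePieceExtractor(...).extract()`, not the actual library code.

# Minimal sketch of the bug, assuming simplified stand-ins for the converter internals.

def convert_buggy(build_vocab, extract):
    vocab = build_vocab()       # slow-tokenizer-specific (piece, score) pairs
    vocab, merges = extract()   # BUG: re-binds `vocab`, discarding the list above
    return vocab, merges

def convert_fixed(build_vocab, extract):
    vocab = build_vocab()       # slow-tokenizer-specific (piece, score) pairs
    _, merges = extract()       # the extractor is only consulted for the merges
    bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab)}
    return bpe_vocab, merges

The real fix, applied to SpmConverter in the first hunk below, has exactly the second shape.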
src/transformers/convert_slow_tokenizer.py
@@ -322,10 +322,11 @@ class SpmConverter(Converter):
         if model_type == 1:
             tokenizer = Tokenizer(Unigram(vocab, unk_id))
         elif model_type == 2:
-            vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
+            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
+            bpe_vocab = {word: i for i, (word, score) in enumerate(vocab)}
             tokenizer = Tokenizer(
                 BPE(
-                    vocab,
+                    bpe_vocab,
                     merges,
                     unk_token=proto.trainer_spec.unk_piece,
                     fuse_unk=True,
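For reference, a self-contained sketch of what the fixed branch now builds with the `tokenizers` library, assuming a `scored_vocab` list and a `merges` list in the shapes the converter handles; the default `unk_token` here is illustrative (the real code passes `proto.trainer_spec.unk_piece`).

from tokenizers import Tokenizer
from tokenizers.models import BPE

def build_bpe_tokenizer(scored_vocab, merges, unk_token="<unk>"):
    # scored_vocab: [(piece, score), ...]; for BPE only the piece -> index
    # ordering matters, the scores are ignored.
    bpe_vocab = {word: i for i, (word, _score) in enumerate(scored_vocab)}
    return Tokenizer(BPE(bpe_vocab, merges, unk_token=unk_token, fuse_unk=True))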
@@ -424,9 +425,10 @@ class CamembertConverter(SpmConverter):
             ("<pad>", 0.0),
             ("</s>NOTUSED", 0.0),
             ("<unk>", 0.0),
+            ("<unk>NOTUSED", -100),
         ]
         # We down-grade the original SentencePiece by -100 to avoid using it and use our added token instead
-        vocab += [(piece.piece, piece.score if i != 0 else piece.score - 100) for i, piece in enumerate(proto.pieces)]
+        vocab += [(piece.piece, piece.score) for piece in proto.pieces[1:]]
         vocab += [("<mask>", 0.0)]
         return vocab
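The Camembert change only matters now that the converter's own vocab list actually becomes the BPE vocab dict. A toy illustration of the id clash the `<unk>NOTUSED` placeholder avoids (the piece names are made up; a BPE SentencePiece model's piece 0 is its `<unk>`):

specials = ["<s>NOTUSED", "<pad>", "</s>NOTUSED", "<unk>"]
pieces = ["<unk>", "▁Hello", "▁world"]                    # illustrative spm pieces

old = specials + pieces                                   # "<unk>" appears twice
new = specials + ["<unk>NOTUSED"] + pieces[1:]            # same length, unique keys

old_ids = {tok: i for i, tok in enumerate(old)}
new_ids = {tok: i for i, tok in enumerate(new)}

assert len(old) == len(new) == 7
assert len(old_ids) == 6          # duplicate key: "<unk>" silently ends up at id 4, not 3
assert len(new_ids) == 7          # every token keeps its own id
assert old_ids["▁Hello"] == new_ids["▁Hello"] == 5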
BIN  tests/fixtures/test_sentencepiece_bpe.model  (new vendored file; binary file not shown)
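The new fixture is a small SentencePiece model trained with the BPE algorithm, so that the `model_type == 2` branch is exercised at all. As a rough idea only, such a fixture could be regenerated along these lines; the corpus path, vocab size, and prefix below are placeholders, not the settings actually used for this file.

import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="sample_corpus.txt",              # placeholder: any small plain-text corpus
    model_prefix="test_sentencepiece_bpe",  # writes test_sentencepiece_bpe.model/.vocab
    vocab_size=1000,                        # placeholder size
    model_type="bpe",                       # the point: a BPE spm model, not unigram
)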
tests/test_tokenization_camembert.py
@@ -25,6 +25,7 @@ from .test_tokenization_common import TokenizerTesterMixin


 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
+SAMPLE_BPE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece_bpe.model")

 FRAMEWORK = "pt" if is_torch_available() else "tf"

@@ -44,6 +45,28 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
         tokenizer.save_pretrained(self.tmpdirname)

+    def test_rust_and_python_bpe_tokenizers(self):
+        tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
+        tokenizer.save_pretrained(self.tmpdirname)
+        rust_tokenizer = CamembertTokenizerFast.from_pretrained(self.tmpdirname)
+
+        sequence = "I was born in 92000, and this is falsé."
+
+        ids = tokenizer.encode(sequence)
+        rust_ids = rust_tokenizer.encode(sequence)
+        self.assertListEqual(ids, rust_ids)
+
+        ids = tokenizer.encode(sequence, add_special_tokens=False)
+        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
+        self.assertListEqual(ids, rust_ids)
+
+        # <unk> tokens are not the same for `rust` than for `slow`.
+        # Because spm gives back raw token instead of `unk` in EncodeAsPieces
+        # tokens = tokenizer.tokenize(sequence)
+        tokens = tokenizer.convert_ids_to_tokens(ids)
+        rust_tokens = rust_tokenizer.tokenize(sequence)
+        self.assertListEqual(tokens, rust_tokens)
+
     def test_rust_and_python_full_tokenizers(self):
         if not self.test_rust_tokenizer:
             return
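The same parity check can be run against a released checkpoint. The sketch below mirrors the new test (save the slow tokenizer, reload it as fast so the slow-to-fast conversion actually runs); `camembert-base` is used here as a convenient SentencePiece-based checkpoint, and the snippet needs network access to fetch it.

import tempfile
from transformers import CamembertTokenizer, CamembertTokenizerFast

slow = CamembertTokenizer.from_pretrained("camembert-base")   # downloads the spm file
sequence = "I was born in 92000, and this is falsé."

with tempfile.TemporaryDirectory() as tmp:
    slow.save_pretrained(tmp)                             # slow files only, no tokenizer.json
    fast = CamembertTokenizerFast.from_pretrained(tmp)    # triggers slow -> fast conversion
    assert slow.encode(sequence) == fast.encode(sequence)
    assert slow.encode(sequence, add_special_tokens=False) == fast.encode(sequence, add_special_tokens=False)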