Fix bloom add prefix space (#25652)

* properly support Sequence of pretokenizers

* actual fix

* make sure the fix works. Tests are not working for sure!

* hacky way

* add TODO

* update

* add a todo

* nits

* rename test

* nits

* rename test
Arthur authored 2023-08-22 14:50:12 +02:00, committed by GitHub
parent 62396cff46
commit e20fab0bbe
2 changed files with 8 additions and 1 deletion

src/transformers/models/bloom/tokenization_bloom_fast.py

@@ -135,7 +135,7 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
         if add_prefix_space:
             pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
-            decoder_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
         self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
         self.backend_tokenizer.decoder = pickle.loads(decoder_state)
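The removed line was a copy-paste slip: decoder_state was derived from pre_tok_state, so the decoder's state was overwritten by a copy of the pre-tokenizer's state and add_prefix_space never actually reached the decoder. Below is a minimal sketch of the pickle/byte-replace trick the fixed code relies on (an illustration, assuming a recent tokenizers release; Bloom's real pre-tokenizer is a Sequence, a plain ByteLevel is used here for brevity):

    import pickle

    from tokenizers import pre_tokenizers

    # The pickle payload embeds the component's compact JSON config,
    # so the flag can be flipped with a byte-level replace and the
    # component rebuilt by unpickling.
    pre_tok = pre_tokenizers.ByteLevel(add_prefix_space=False)
    state = pickle.dumps(pre_tok)
    state = state.replace(b'"add_prefix_space":false', b'"add_prefix_space":true')
    pre_tok = pickle.loads(state)
    assert pre_tok.add_prefix_space  # exact bytes may vary across tokenizers versions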

tests/models/bloom/test_tokenization_bloom.py

@@ -133,3 +133,10 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         # maximum sequence length of the positional embeddings.
         self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
         self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
+
+    def test_add_prefix_space_fast(self):
+        tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True)
+        tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False)
+        tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey")
+        tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey")
+        self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix)
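For reference, the behavior the new test pins down can also be checked by hand (a sketch; bigscience/bloom-560m is only an illustrative checkpoint name, and the exact tokens depend on the vocab):

    from transformers import BloomTokenizerFast

    # add_prefix_space is forwarded to the backend pre-tokenizer and,
    # with this fix, to the decoder as well.
    name = "bigscience/bloom-560m"  # illustrative checkpoint
    tok_prefix = BloomTokenizerFast.from_pretrained(name, add_prefix_space=True)
    tok_plain = BloomTokenizerFast.from_pretrained(name, add_prefix_space=False)

    print(tok_prefix.tokenize("Hey"))  # leading-space variant, e.g. a 'ĠHey'-style token
    print(tok_plain.tokenize("Hey"))   # e.g. ['Hey']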