Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-31 02:02:21 +06:00
Fix bloom add prefix space (#25652)
* properly support Sequence of pretokenizers
* actual fix
* make sure the fix works. Tests are not working for sure!
* hacky way
* add TODO
* update
* add a todo
* nits
* rename test
* nits
* rename test
This commit is contained in:
parent 62396cff46
commit e20fab0bbe
@@ -135,7 +135,7 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
         if add_prefix_space:
             pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
-            decoder_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
         self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
         self.backend_tokenizer.decoder = pickle.loads(decoder_state)
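The one-line fix corrects a copy-paste bug: `decoder_state` was being rebuilt from the pre-tokenizer's pickled state instead of the decoder's own, so the decoder never picked up the flipped flag. The surrounding pattern, toggling `add_prefix_space` by patching each component's serialized state and reloading it, can be sketched as follows (a minimal illustration of the technique; the checkpoint name is an assumption):

    import pickle

    from transformers import BloomTokenizerFast

    # Checkpoint name is illustrative; any Bloom checkpoint with a
    # byte-level pre-tokenizer/decoder would do.
    tok = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")

    # Serialize the backend components; the pickled bytes embed their JSON config.
    pre_tok_state = pickle.dumps(tok.backend_tokenizer.pre_tokenizer)
    decoder_state = pickle.dumps(tok.backend_tokenizer.decoder)

    # Patch each component's *own* state. The bug patched pre_tok_state twice
    # and assigned the second result to decoder_state.
    pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
    decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')

    # Reload the patched components into the backend tokenizer.
    tok.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
    tok.backend_tokenizer.decoder = pickle.loads(decoder_state)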
@@ -133,3 +133,10 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         # maximum sequence length of the positional embeddings.
         self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
         self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
+
+    def test_add_prefix_space_fast(self):
+        tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True)
+        tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False)
+        tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey")
+        tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey")
+        self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix)
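The new test asserts only that the two settings diverge. A hedged usage sketch of the behavior it pins down (checkpoint name and the exact token strings in the comments are illustrative, not taken from the commit):

    from transformers import BloomTokenizerFast

    tok_w = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m", add_prefix_space=True)
    tok_wo = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m", add_prefix_space=False)

    # With the fix, the pre-tokenizer and decoder agree, and the two settings
    # produce different tokenizations of the same text.
    print(tok_w.tokenize("Hey"))   # e.g. ['ĠHey'] -- byte-level marker for the injected space
    print(tok_wo.tokenize("Hey"))  # e.g. ['Hey']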