diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py
index 2be00ec4af7..8339ece5433 100644
--- a/src/transformers/models/bloom/tokenization_bloom_fast.py
+++ b/src/transformers/models/bloom/tokenization_bloom_fast.py
@@ -135,7 +135,7 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
 
         if add_prefix_space:
             pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
-            decoder_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
         self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
         self.backend_tokenizer.decoder = pickle.loads(decoder_state)
 
diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py
index 4857e2ab5fc..576a191c70b 100644
--- a/tests/models/bloom/test_tokenization_bloom.py
+++ b/tests/models/bloom/test_tokenization_bloom.py
@@ -133,3 +133,10 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         # maximum sequence length of the positoonal embeddings.
         self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
         self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
+
+    def test_add_prefix_space_fast(self):
+        tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True)
+        tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False)
+        tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey")
+        tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey")
+        self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix)
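
Note on the fix: before this patch, decoder_state was overwritten with the serialized pre-tokenizer state, so the add_prefix_space=True setting was never applied to the backend decoder. Below is a minimal standalone sketch of the behavior the new test exercises, outside the test harness. It assumes network access and uses bigscience/bloom-560m purely as an example checkpoint; any Bloom tokenizer repo should behave the same way.

from transformers import BloomTokenizerFast

checkpoint = "bigscience/bloom-560m"  # example checkpoint, not mandated by the patch

# With the fix, add_prefix_space is propagated to both the pre-tokenizer
# and the decoder of the backend (Rust) tokenizer.
tok_w_prefix = BloomTokenizerFast.from_pretrained(checkpoint, add_prefix_space=True)
tok_wo_prefix = BloomTokenizerFast.from_pretrained(checkpoint, add_prefix_space=False)

tokens_w_prefix = tok_w_prefix.tokenize("Hey")
tokens_wo_prefix = tok_wo_prefix.tokenize("Hey")

# Mirrors test_add_prefix_space_fast: the two tokenizations should differ.
assert tokens_w_prefix != tokens_wo_prefix

# Decoding now goes through a decoder whose state also reflects add_prefix_space.
ids = tok_w_prefix("Hey")["input_ids"]
print(tok_w_prefix.decode(ids))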