Mirror of https://github.com/huggingface/transformers.git
[UDOP] Add special tokens to tokenizer (#29594)
* Add special tokens
* Add special tokens
* Use fmt
* Uncomment code
* Add test
* Remove scripts
* Address comments
* Improve tests
* Address comment
* Remove flag
This commit is contained in:
parent d9850abd40
commit ecfe9be705
File diff suppressed because one or more lines are too long
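The suppressed diffs are likely the tokenizer files with very long lines, where the new `<loc_i>` tokens themselves are registered; the visible diff below is the new test. As a rough illustration only (not this commit's actual code, which bakes the tokens into the UDOP tokenizer files), special tokens like these can be registered on an existing tokenizer through the public `add_tokens` API. The `t5-base` checkpoint and the count of 500 location buckets are assumptions for the sketch.

from transformers import AddedToken, AutoTokenizer

# Hypothetical sketch: register <loc_i> tokens on a T5-style tokenizer.
# "t5-base" and range(500) are illustrative assumptions.
tokenizer = AutoTokenizer.from_pretrained("t5-base")
loc_tokens = [AddedToken(f"<loc_{i}>", lstrip=True, rstrip=False) for i in range(500)]
tokenizer.add_tokens(loc_tokens, special_tokens=True)

# Special tokens are kept atomic: "<loc_58>" now encodes to a single id.
print(tokenizer.convert_tokens_to_ids("<loc_58>"))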
@@ -1893,3 +1893,31 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        self.assertListEqual(encoding_p["attention_mask"], [1, 1, 1])
        self.assertDictEqual(dict(encoding_p), dict(encoding_r))
        self.assertEqual(tokenizer_p.decode(encoding_p["input_ids"]), expected_decoding)

    def test_special_tokens(self):
        tokenizer_p = UdopTokenizer.from_pretrained("microsoft/udop-large")
        tokenizer_r = UdopTokenizerFast.from_pretrained("microsoft/udop-large")

        # encode
        text = "paragraph<loc_58>. Hey"
        encoding_p = tokenizer_p.encode(text)
        encoding_r = tokenizer_r.encode(text)

        assert encoding_p == encoding_r == [8986, 32942, 3, 5, 9459, 1]

        # decode
        # this is different between the slow and fast tokenizer,
        # due to the former having `spaces_between_special_tokens=True` by default
        ids = [0, 8986, 32942, 32966, 32554, 32551, 1]

        # test slow tokenizer
        decoding = tokenizer_p.decode(ids, spaces_between_special_tokens=False)

        expected_decoding = "<pad>paragraph<loc_58><loc_34><loc_446><loc_449></s>"
        assert decoding == expected_decoding

        # test fast tokenizer
        decoding = tokenizer_r.decode(ids)

        expected_decoding = "<pad> paragraph<loc_58><loc_34><loc_446><loc_449></s>"
        assert decoding == expected_decoding
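The decode asymmetry the test pins down can be reproduced outside the test suite; a minimal sketch using the same checkpoint and token ids as above:

from transformers import UdopTokenizer, UdopTokenizerFast

slow = UdopTokenizer.from_pretrained("microsoft/udop-large")
fast = UdopTokenizerFast.from_pretrained("microsoft/udop-large")

ids = [0, 8986, 32942, 32966, 32554, 32551, 1]

# The slow (SentencePiece) tokenizer defaults to spaces_between_special_tokens=True,
# so it must opt out to produce the same compact string as the fast tokenizer.
print(slow.decode(ids, spaces_between_special_tokens=False))
# -> "<pad>paragraph<loc_58><loc_34><loc_446><loc_449></s>"
print(fast.decode(ids))
# -> "<pad> paragraph<loc_58><loc_34><loc_446><loc_449></s>"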