Use raw string for regex to search <extra_id> (#21162)

* Use raw string for regex to search <extra_id>

* Fix the same issue in the test file `test_tokenization_t5.py`
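
Context for the change (not part of the commit): in an ordinary Python string literal, `\d` is an invalid escape sequence. CPython currently keeps the backslash, so the old pattern happened to match correctly, but the interpreter flags the literal at compile time, and that warning is slated to become a hard error in a future release. A minimal, self-contained sketch of the difference, assuming CPython's invalid-escape warning behavior:

import re
import warnings

# Compiling source that contains "\d" in a plain string literal triggers the
# invalid-escape warning (DeprecationWarning on older interpreters,
# SyntaxWarning on newer ones).
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    compile(r'"<extra_id_\d+>"', "<demo>", "eval")  # compiled source contains \d
print([str(w.message) for w in caught])  # e.g. ["invalid escape sequence '\\d'"]

# The raw string hands the backslash to the regex engine unchanged, which is
# what the pattern meant all along; matching behavior is identical today.
assert re.search(r"<extra_id_\d+>", "<extra_id_7>") is not None
assert re.search(r"<extra_id_\d+>", "<pad>") is None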
Pengfei Liu 2023-01-18 09:43:54 -05:00 committed by GitHub
parent 8a17da2f7f
commit 8ad06b7c13
2 changed files with 3 additions and 3 deletions

src/transformers/models/t5/tokenization_t5.py

@@ -214,7 +214,7 @@ class T5Tokenizer(PreTrainedTokenizer):

     def get_sentinel_tokens(self):
         return list(
-            set(filter(lambda x: bool(re.search("<extra_id_\d+>", x)) is not None, self.additional_special_tokens))
+            set(filter(lambda x: bool(re.search(r"<extra_id_\d+>", x)) is not None, self.additional_special_tokens))
         )

     def get_sentinel_token_ids(self):
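
The filter above can be exercised standalone. A minimal sketch with a hypothetical `additional_special_tokens` list; the simplified truthiness test is ours, not the library's:

import re

# Hypothetical stand-in for tokenizer.additional_special_tokens.
additional_special_tokens = ["<extra_id_0>", "<extra_id_1>", "<pad>", "</s>"]

# Same shape as get_sentinel_tokens above: keep only the tokens that match
# the sentinel pattern. re.search returns a Match object or None, so the
# result itself works as the filter predicate.
sentinel_tokens = list(
    set(filter(lambda x: re.search(r"<extra_id_\d+>", x), additional_special_tokens))
)
assert sorted(sentinel_tokens) == ["<extra_id_0>", "<extra_id_1>"]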

tests/models/t5/test_tokenization_t5.py

@@ -386,7 +386,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         sentinel_tokens = tokenizer.get_sentinel_tokens()
         self.assertEquals(len(sentinel_tokens), 10)
         self.assertListEqual(sorted(sentinel_tokens), sorted([f"<extra_id_{str(i)}>" for i in range(0, 10)]))
-        self.assertTrue([re.search("<extra_id_\d+>", token) is not None for token in sentinel_tokens])
+        self.assertTrue([re.search(r"<extra_id_\d+>", token) is not None for token in sentinel_tokens])

     def test_get_sentinel_token_ids(self):
         tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=10)
@@ -397,7 +397,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         sentinel_tokens = tokenizer.get_sentinel_tokens()
         self.assertEquals(len(sentinel_tokens), 10)
         self.assertListEqual(sorted(sentinel_tokens), sorted([f"<extra_id_{str(i)}>" for i in range(0, 10)]))
-        self.assertTrue([re.search("<extra_id_\d+>", token) is not None for token in sentinel_tokens])
+        self.assertTrue([re.search(r"<extra_id_\d+>", token) is not None for token in sentinel_tokens])

     def test_get_sentinel_token_ids_for_fasttokenizer(self):
         tokenizer = T5TokenizerFast(SAMPLE_VOCAB, extra_ids=10)
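
The assertions in these tests can be reproduced without a trained vocabulary. A minimal sketch, assuming only that get_sentinel_tokens returns one token per extra id:

import re

# Hypothetical stand-in for tokenizer.get_sentinel_tokens() with extra_ids=10.
sentinel_tokens = [f"<extra_id_{i}>" for i in range(10)]

assert len(sentinel_tokens) == 10
# all() folds the per-token match results into a single boolean, so the
# check fails if any one token misses the raw-string pattern.
assert all(re.search(r"<extra_id_\d+>", t) is not None for t in sentinel_tokens)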