Override get_clean_sequence as this was causing a bottleneck (#13183)

This commit is contained in:
NielsRogge 2021-08-23 09:41:35 +02:00 committed by GitHub
parent 143738214c
commit 588e6caa15
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -15,6 +15,7 @@
import unittest
from typing import Tuple
from transformers import AddedToken, LukeTokenizer
from transformers.testing_utils import require_torch, slow
@ -81,6 +82,11 @@ class Luke(TokenizerTesterMixin, unittest.TestCase):
assert encoded_sentence == encoded_text_from_decode
assert encoded_pair == encoded_pair_from_decode
def get_clean_sequence(self, tokenizer, max_length=20) -> Tuple[str, list]:
    """Return a fixed sample sentence together with its encoded ids.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``.
        max_length: Accepted for signature compatibility; not used here.

    Returns:
        A ``(text, ids)`` pair, where ``ids`` is whatever ``tokenizer.encode``
        returns for the text with ``add_special_tokens=False``.
    """
    sentence = "Beyonce lives in Los Angeles"
    # The sentence is hard-coded, so a single encode call is all that is needed.
    return sentence, tokenizer.encode(sentence, add_special_tokens=False)
def test_space_encoding(self):
tokenizer = self.get_tokenizer()