mirror of https://github.com/huggingface/transformers.git
synced 2025-07-31 02:02:21 +06:00
gpt-2 tokenizer
This commit is contained in:
parent cd110835a0
commit db98a4a48b
@@ -221,7 +221,10 @@ class GPT2Tokenizer(object):
         """ Tokenize a string. """
         bpe_tokens = []
         for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[ord(b)] for b in token)
+            if sys.version_info[0] == 2:
+                token = ''.join(self.byte_encoder[ord(b)] for b in token)
+            else:
+                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
         return bpe_tokens
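For context, a minimal sketch of the byte-level encoding step the new Python 3 branch performs; the byte_encoder below is a hypothetical stand-in for GPT-2's byte-to-unicode table, not the mapping used in the repository.

# Minimal sketch: why the Python 3 branch encodes the token to UTF-8 bytes
# and indexes byte_encoder with ints, while Python 2 needs ord(b).
# This byte_encoder is a hypothetical stand-in, not GPT-2's actual table.
byte_encoder = {b: chr(b if 32 <= b < 127 else 256 + b) for b in range(256)}

token = "héllo"
# Python 3: str is unicode, so iterate over its UTF-8 bytes (ints 0-255).
encoded = ''.join(byte_encoder[b] for b in token.encode('utf-8'))
print(encoded)  # every byte mapped to a printable unicode character
# Python 2: a str is already a byte string; iteration yields 1-char strings,
# so ord(b) recovers the byte value -- hence the sys.version_info check.

Either way, both branches are meant to produce the same byte-to-character string for a given input, which self.bpe() then splits into subword tokens.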