gpt-2 tokenizer

thomwolf 2019-05-01 11:40:48 +02:00
parent cd110835a0
commit db98a4a48b

@@ -221,7 +221,10 @@ class GPT2Tokenizer(object):
         """ Tokenize a string. """
         bpe_tokens = []
         for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[ord(b)] for b in token)
+            if sys.version_info[0] == 2:
+                token = ''.join(self.byte_encoder[ord(b)] for b in token)
+            else:
+                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
         return bpe_tokens
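
The new Python 3 branch indexes byte_encoder with raw byte values, because iterating over the result of str.encode('utf-8') yields ints on Python 3; the Python 2 branch keeps ord(b), since iterating a Python 2 byte string yields one-character strings. Below is a minimal, non-authoritative sketch of the mapping involved, assuming self.byte_encoder is built from the bytes_to_unicode() helper of OpenAI's GPT-2 reference encoder (the token and variable names here are illustrative only):

    def bytes_to_unicode():
        """Map every byte value (0-255) to a printable unicode character.

        Printable ASCII and most Latin-1 bytes map to themselves; the
        remaining bytes (controls, whitespace) are shifted past 255 so
        every byte gets a visible, non-space stand-in.
        """
        bs = (list(range(ord('!'), ord('~') + 1))
              + list(range(ord('¡'), ord('¬') + 1))
              + list(range(ord('®'), ord('ÿ') + 1)))
        cs = bs[:]
        n = 0
        for b in range(256):
            if b not in bs:
                bs.append(b)
                cs.append(256 + n)
                n += 1
        return dict(zip(bs, map(chr, cs)))

    byte_encoder = bytes_to_unicode()

    # Python 3: iterating over bytes yields ints, which index the table
    # directly. (On Python 2, iterating a byte string yields 1-char
    # strings, hence the ord(b) in the diff's Python 2 branch.)
    token = "héllo"
    mapped = ''.join(byte_encoder[b] for b in token.encode('utf-8'))
    print(mapped)  # 'hÃ©llo' -- each UTF-8 byte rendered as one visible char

Mapping every byte to a printable character this way lets the BPE merges in self.bpe() operate on plain unicode strings while still covering arbitrary byte sequences.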