mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)
Optimize t5 tokenize logic to avoid redundant calls (#32270)
* Optimize t5 tokenize logic to avoid redundant calls
* fix and overwrite copies
parent f2122cc6eb
commit 5019aabfac
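All four hunks below make the same change to the sentencepiece `_tokenize` helper. Previously the method encoded `text` unconditionally before checking the legacy/prefix condition, so on the non-legacy path that first `sp_model.encode` result was discarded and a second encode (with the `unk_token` prefix) ran anyway. The fix moves the first encode into the early-return branch, so each path performs exactly one encode. A minimal before/after sketch of the pattern, written as free functions taking a hypothetical `sp_model` argument rather than as the actual methods; the trailing slice follows the docstring's `[4:]` example:

SPIECE_UNDERLINE = "▁"

def tokenize_before(sp_model, text, legacy, unk_token, unk_token_length):
    tokens = sp_model.encode(text, out_type=str)  # always paid for
    if legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
        return tokens
    # the `tokens` computed above is discarded here: one redundant encode
    tokens = sp_model.encode(unk_token + text, out_type=str)
    return tokens[unk_token_length:] if len(tokens) >= unk_token_length else tokens

def tokenize_after(sp_model, text, legacy, unk_token, unk_token_length):
    if legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
        return sp_model.encode(text, out_type=str)  # encoded only when returned
    tokens = sp_model.encode(unk_token + text, out_type=str)
    return tokens[unk_token_length:] if len(tokens) >= unk_token_length else tokens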
@@ -261,9 +261,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
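To make the saved call visible, here is a throwaway counting stub (hypothetical; the real `sp_model` is a sentencepiece `SentencePieceProcessor`) run through both sketches above:

class CountingStub:
    """Hypothetical sp_model stand-in that only counts encode calls."""

    def __init__(self):
        self.calls = 0

    def encode(self, text, out_type=str):
        self.calls += 1
        return text.split()  # crude placeholder tokenization

before, after = CountingStub(), CountingStub()
tokenize_before(before, " Hey", legacy=False, unk_token="<unk>", unk_token_length=4)
tokenize_after(after, " Hey", legacy=False, unk_token="<unk>", unk_token_length=4)
print(before.calls, after.calls)  # 2 1: the non-legacy path drops one encode

The three hunks that follow are the same edit applied to the tokenizers whose method is a copy of this one; "fix and overwrite copies" in the commit message refers to bringing those duplicates back in sync, presumably via the repo's `# Copied from` consistency check.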
@@ -463,9 +463,8 @@ class SeamlessM4TTokenizer(PreTrainedTokenizer):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
@@ -389,9 +389,8 @@ class T5Tokenizer(PreTrainedTokenizer):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
@@ -446,9 +446,8 @@ class UdopTokenizer(PreTrainedTokenizer):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
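As an aside, the docstring context repeated in every hunk explains why the `unk_token` prefix dance exists at all: with sentencepiece's `add_dummy_prefix` option disabled, a leading `SPIECE_UNDERLINE` gets stripped, so the method encodes `unk_token + text` and slices the unk pieces back off. A worked trace of the docstring's own example, assuming `"<unk>"` happens to encode to four pieces (hence `unk_token_length = 4`):

# Hypothetical piece sequences, following the docstring's example values.
pieces = ["<", "unk", ">", "▁Hey"]   # sp_model.encode("<unk> Hey", out_type=str)
unk_token_length = 4                 # "<unk>" itself became 4 pieces
print(pieces[unk_token_length:])     # ['▁Hey']: the leading "▁" survives
# Encoding " Hey" directly would have lost it, yielding pieces for "Hey" alone.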