From 5019aabfacf7599b9a6b4e7a1adc1fb5c9017727 Mon Sep 17 00:00:00 2001
From: leejet
Date: Mon, 29 Jul 2024 15:51:43 +0800
Subject: [PATCH] Optimize t5 tokenize logic to avoid redundant calls (#32270)

* Optimize t5 tokenize logic to avoid redundant calls

* fix and overwrite copies
---
 src/transformers/models/llama/tokenization_llama.py  | 3 +--
 .../models/seamless_m4t/tokenization_seamless_m4t.py | 3 +--
 src/transformers/models/t5/tokenization_t5.py        | 3 +--
 src/transformers/models/udop/tokenization_udop.py    | 3 +--
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py
index 385ad2d88e1..cc03c1470ee 100644
--- a/src/transformers/models/llama/tokenization_llama.py
+++ b/src/transformers/models/llama/tokenization_llama.py
@@ -261,9 +261,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)

diff --git a/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py b/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py
index 230283a0d4a..d6017a6e057 100644
--- a/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py
+++ b/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py
@@ -463,9 +463,8 @@ class SeamlessM4TTokenizer(PreTrainedTokenizer):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)

diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index 0f2ae101c8f..1e166a78f10 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -389,9 +389,8 @@ class T5Tokenizer(PreTrainedTokenizer):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)

diff --git a/src/transformers/models/udop/tokenization_udop.py b/src/transformers/models/udop/tokenization_udop.py
index 704b5c48dee..cd1e2b55312 100644
--- a/src/transformers/models/udop/tokenization_udop.py
+++ b/src/transformers/models/udop/tokenization_udop.py
@@ -446,9 +446,8 @@ class UdopTokenizer(PreTrainedTokenizer):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
""" - tokens = self.sp_model.encode(text, out_type=str) if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")): - return tokens + return self.sp_model.encode(text, out_type=str) # 1. Encode string + prefix ex: " Hey" tokens = self.sp_model.encode(self.unk_token + text, out_type=str)