Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-31 02:02:21 +06:00
🚨🚨 🚨🚨 [Tokenizer] attempt to fix add_token issues 🚨🚨 🚨🚨 (#23909)
* fix test for bart. Order is correct now let's skip BPEs * ouf * styling * fix bert.... * slow refactoring * current updates * massive refactoring * update * NICE! * update to see where I am at * updates * update * update * revert * updates * updates * start supporting legacy_save * styling * big update * revert some changes * nits * nniiiiiice * small fixes * kinda fix t5 with new behaviour * major update * fixup * fix copies * today's updates * fix byt5 * upfate * update * update * updates * update vocab size test * Barthez does not use not need the fairseq offset ids * super calll must be after * calll super * move all super init * move other super init * fixup * nits * more fixes * nits * more fixes * nits * more fix * remove useless files * ouch all of them are affected * and more! * small imporvements * no more sanitize token * more changes around unique no split tokens * partially fix more things * keep legacy save but add warning * so... more fixes * updates * guess deberta tokenizer could be nuked * fixup * fixup did some bad things * nuke it if it breaks * remove prints and pretrain fast from slow with new format. * fixups * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * fiou * nit * by default specials should not be normalized? * update * remove brakpoint * updates * a lot of updates * fixup * fixes revert some changes to match fast * small nits * that makes it cleaner * fix camembert accordingly * update * some lest breaking changes * update * fixup * fix byt5 and whisper mostly * some more fixes, canine's byte vocab * fix gpt2 * fix most of the perceiver tests (4 left) * fix layout lmv3 * fixup * fix copies for gpt2 style * make sure to only warn once * fix perciever and gpt2 tests * some more backward compatibility: also read special tokens map because some ppl use it........////..... * fixup * add else when reading * nits * fresh updates * fix copies * will this make everything faster? * fixes * more fixes * update * more fixes * fixup * is the source of truth right? * sorry camembert for the troubles * current updates * fixup * update led * update * fix regression * fix single word * more model specific fixes * fix t5 tests * fixup * more comments * update * fix nllb * rstrip removed * small fixes * better handle additional_special_tokens and vocab sizes * fixing * styling * fix 4 / 21 * fixup * fix nlbb's tests * some fixes * fix t5 * fixes * style * fix canine tests * damn this is nice * nits * m2m100 nit * fixups * fixes! 
* fixup * stash * fix merge * revert bad change * fixup * correct order for code Llama * fix speecht5 post merge * styling * revert source of 11 fails * small nits * all changes in one go * fnet hack * fix 2 more tests * update based on main branch of tokenizers * fixup * fix VITS issues * more fixes * fix mgp test * fix camembert issues * oups camembert still has 2 failing tests * mluke fixes * decode fixes * small nits * nits * fix llama and vits * fix camembert * smal nits * more fixes when initialising a fast from a slow and etc * fix one of the last test * fix CPM tokenizer test * fixups * fix pop2piano * fixup * ⚠️ Change tokenizers required version ⚠️ * ⚠️ Change tokenizers required version ⚠️ * "tokenizers>=0.14,<0.15", don't forget smaller than * fix musicgen tests and pretraiendtokenizerfast * fix owlvit and all * update t5 * fix 800 red * fix tests * fix the fix of the fix of t5 * styling * documentation nits * cache _added_tokens_encoder * fixups * Nit * fix red tests * one last nit! * make eveything a lot simpler * Now it's over 😉 * few small nits * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * updates that work for now * tests that should no be skipped / changed and fixed next * fixup * i am ashamed * pushe the fix * update * fixups * nits * fix added_tokens_encoder * fix canine test * fix pegasus vocab * fix transfoXL * fixup * whisper needs to be fixed for train new * pegasus nits * more pegasus fixes * minor update * better error message in failed test * fix whisper failing test * fix whisper failing test * fix pegasus * fixup * fix **** pegasus * reset things * remove another file * attempts to fix the strange custome encoder and offset * nits here and there * update * fixup * nit * fix the whisper test * nits nits * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * updates based on review * some small update to potentially remove * nits * import rlu cache * Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut <hi@lysand.re> * move warning to `from_pretrained` * update tests results now that the special tokens are always added --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Lysandre Debut <hi@lysand.re>
This commit is contained in:
parent 835b0a0533
commit 2da8853775
.gitignore (vendored): 2 changes

@@ -166,4 +166,4 @@ tags
.DS_Store

# ruff
.ruff_cache
.ruff_cache
setup.py: 2 changes

@@ -172,7 +172,7 @@ _deps = [
"tf2onnx",
"timeout-decorator",
"timm",
"tokenizers>=0.11.1,!=0.11.3,<0.14",
"tokenizers>=0.14,<0.15",
"torch>=1.10,!=1.12.0",
"torchaudio",
"torchvision",
@@ -78,7 +78,7 @@ deps = {
"tf2onnx": "tf2onnx",
"timeout-decorator": "timeout-decorator",
"timm": "timm",
"tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.14",
"tokenizers": "tokenizers>=0.14,<0.15",
"torch": "torch>=1.10,!=1.12.0",
"torchaudio": "torchaudio",
"torchvision": "torchvision",
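Note (not part of the diff): the two hunks above move the `tokenizers` pin from ">=0.11.1,!=0.11.3,<0.14" to ">=0.14,<0.15". A minimal sketch for checking a local environment against the new range, assuming Python 3.8+ and that the `packaging` package is installed:

from importlib.metadata import version
from packaging.specifiers import SpecifierSet

# True if the installed tokenizers release satisfies the new requirement
installed = version("tokenizers")
print(installed, installed in SpecifierSet(">=0.14,<0.15"))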
@@ -159,6 +159,14 @@ class AlbertTokenizer(PreTrainedTokenizer):

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)

super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
@@ -174,14 +182,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
**kwargs,
)

self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)

@property
def vocab_size(self) -> int:
return len(self.sp_model)
@@ -228,6 +228,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
# Logic to handle special cases see https://github.com/google-research/bert/blob/master/README.md#tokenization
# `9,9` -> ['▁9', ',', '9'] instead of [`_9,`, '9']
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
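Note (not part of the diff): the Albert hunks show the refactoring pattern repeated across the slow tokenizers in this commit: state that the base constructor needs (vocab attributes, the sentencepiece model) is now built before `super().__init__()` runs instead of after it. A minimal sketch of the idea, using a hypothetical `MySentencePieceTokenizer` and omitting the token-conversion methods a real subclass must implement:

import sentencepiece as spm
from transformers import PreTrainedTokenizer


class MySentencePieceTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file, unk_token="<unk>", **kwargs):
        # 1) build the backend vocabulary first...
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)
        # 2) ...then call the base constructor, which can now resolve the
        #    special tokens it registers against an initialised vocab
        super().__init__(unk_token=unk_token, **kwargs)

    @property
    def vocab_size(self) -> int:
        return len(self.sp_model)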
@@ -204,21 +204,10 @@ class BartTokenizer(PreTrainedTokenizer):
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

# Mask token behave like a normal word, i.e. include the space before it
# TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. See `test_embeded_special_tokens`
# Also this not only will strip the spaces but any punctuation
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)

with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -235,6 +224,19 @@ class BartTokenizer(PreTrainedTokenizer):
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)

@property
def vocab_size(self):
return len(self.encoder)
@@ -170,6 +170,7 @@ class BartTokenizerFast(PreTrainedTokenizerFast):
trim_offsets=True,
**kwargs,
):
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
merges_file,
@@ -47,6 +47,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {

SPIECE_UNDERLINE = "▁"

# TODO this class is useless. This is the most standard sentencpiece model. Let's find which one is closest and nuke this.


class BarthezTokenizer(PreTrainedTokenizer):
"""
@@ -141,6 +143,9 @@ class BarthezTokenizer(PreTrainedTokenizer):

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
@@ -153,15 +158,6 @@ class BarthezTokenizer(PreTrainedTokenizer):
**kwargs,
)

self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))

self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) - 1
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
@@ -251,16 +247,10 @@ class BarthezTokenizer(PreTrainedTokenizer):

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)

return spm_id if spm_id else self.unk_token_id
return self.sp_model.PieceToId(token)

def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index)

def convert_tokens_to_string(self, tokens):
@@ -139,18 +139,6 @@ class BartphoTokenizer(PreTrainedTokenizer):

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

self.vocab_file = vocab_file
self.monolingual_vocab_file = monolingual_vocab_file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
@@ -174,6 +162,18 @@ class BartphoTokenizer(PreTrainedTokenizer):

self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
@@ -196,20 +196,6 @@ class BertTokenizer(PreTrainedTokenizer):
strip_accents=None,
**kwargs,
):
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -225,7 +211,22 @@ class BertTokenizer(PreTrainedTokenizer):
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

@property
def do_lower_case(self):
@@ -96,6 +96,11 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

self.vocab_file = vocab_file

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)

# Add extra_ids to the special token list
super().__init__(
bos_token=bos_token,
@@ -107,11 +112,6 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
**kwargs,
)

self.vocab_file = vocab_file

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)

@property
def vocab_size(self):
return self.sp_model.get_piece_size()
@@ -160,25 +160,6 @@ class BertJapaneseTokenizer(PreTrainedTokenizer):
jumanpp_kwargs=None,
**kwargs,
):
super().__init__(
spm_file=spm_file,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
do_lower_case=do_lower_case,
do_word_tokenize=do_word_tokenize,
do_subword_tokenize=do_subword_tokenize,
word_tokenizer_type=word_tokenizer_type,
subword_tokenizer_type=subword_tokenizer_type,
never_split=never_split,
mecab_kwargs=mecab_kwargs,
sudachi_kwargs=sudachi_kwargs,
jumanpp_kwargs=jumanpp_kwargs,
**kwargs,
)

if subword_tokenizer_type == "sentencepiece":
if not os.path.isfile(spm_file):
raise ValueError(
@@ -226,13 +207,31 @@ class BertJapaneseTokenizer(PreTrainedTokenizer):
self.subword_tokenizer_type = subword_tokenizer_type
if do_subword_tokenize:
if subword_tokenizer_type == "wordpiece":
self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
elif subword_tokenizer_type == "character":
self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=str(unk_token))
elif subword_tokenizer_type == "sentencepiece":
self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token)
self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=str(unk_token))
else:
raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")
super().__init__(
spm_file=spm_file,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
do_lower_case=do_lower_case,
do_word_tokenize=do_word_tokenize,
do_subword_tokenize=do_subword_tokenize,
word_tokenizer_type=word_tokenizer_type,
subword_tokenizer_type=subword_tokenizer_type,
never_split=never_split,
mecab_kwargs=mecab_kwargs,
sudachi_kwargs=sudachi_kwargs,
jumanpp_kwargs=jumanpp_kwargs,
**kwargs,
)

@property
def do_lower_case(self):
@@ -134,18 +134,6 @@ class BertweetTokenizer(PreTrainedTokenizer):
mask_token="<mask>",
**kwargs,
):
super().__init__(
normalization=normalization,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)

try:
from emoji import demojize

@@ -161,10 +149,10 @@ class BertweetTokenizer(PreTrainedTokenizer):
self.merges_file = merges_file

self.encoder = {}
self.encoder[self.bos_token] = 0
self.encoder[self.pad_token] = 1
self.encoder[self.eos_token] = 2
self.encoder[self.unk_token] = 3
self.encoder[bos_token] = 0
self.encoder[pad_token] = 1
self.encoder[eos_token] = 2
self.encoder[unk_token] = 3

self.add_from_file(vocab_file)

@@ -178,9 +166,20 @@ class BertweetTokenizer(PreTrainedTokenizer):

self.normalization = normalization
self.tweetPreprocessor = TweetTokenizer()

self.special_puncts = {"’": "'", "…": "..."}

super().__init__(
normalization=normalization,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)

def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
@@ -127,6 +127,11 @@ class BigBirdTokenizer(PreTrainedTokenizer):

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

self.vocab_file = vocab_file

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
@@ -139,11 +144,6 @@ class BigBirdTokenizer(PreTrainedTokenizer):
**kwargs,
)

self.vocab_file = vocab_file

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)

@property
def vocab_size(self):
return self.sp_model.get_piece_size()
@@ -112,15 +112,6 @@ class BioGptTokenizer(PreTrainedTokenizer):
pad_token="<pad>",
**kwargs,
):
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
**kwargs,
)

try:
import sacremoses
except ImportError:
@@ -145,6 +136,15 @@ class BioGptTokenizer(PreTrainedTokenizer):
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
**kwargs,
)

@property
def vocab_size(self):
"""Returns vocab size"""
@@ -187,28 +187,21 @@ class BlenderbotTokenizer(PreTrainedTokenizer):
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)

# these special tokens are not part of the vocab.json, let's add them in the correct order

with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -225,6 +218,19 @@ class BlenderbotTokenizer(PreTrainedTokenizer):
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)

@property
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size with Roberta->Blenderbot, RoBERTa->Blenderbot
def vocab_size(self):
@@ -232,7 +238,9 @@ class BlenderbotTokenizer(PreTrainedTokenizer):

# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab with Roberta->Blenderbot, RoBERTa->Blenderbot
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
vocab = dict(self.encoder).copy()
vocab.update(self.added_tokens_encoder)
return vocab

# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Blenderbot, RoBERTa->Blenderbot
def bpe(self, token):
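Note (not part of the diff): Blenderbot's mask token is now wrapped as AddedToken(..., normalized=False), so the special token is matched verbatim rather than after normalization. A short illustration of the flag, assuming the slow-tokenizer API in this release range:

from transformers.tokenization_utils import AddedToken

mask_token = AddedToken("<mask>", lstrip=True, rstrip=False, normalized=False)
print(mask_token.content, mask_token.lstrip, mask_token.normalized)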
@@ -149,6 +149,11 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast):
trim_offsets=True,
**kwargs,
):
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)
super().__init__(
vocab_file,
merges_file,
@@ -106,8 +106,6 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
pad_token="__null__",
**kwargs,
):
super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs)

with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -116,6 +114,7 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs)

@property
def vocab_size(self) -> int:
@@ -16,7 +16,7 @@


import warnings
from typing import Dict, List, Optional, Tuple
from typing import List, Optional, Tuple

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
@@ -72,7 +72,7 @@ class ByT5Tokenizer(PreTrainedTokenizer):
# Add extra_ids to the special token list
if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and additional_special_tokens is not None:
elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
# Check that we have the right number of extra_id special tokens
extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
if extra_tokens != extra_ids:
@@ -82,38 +82,31 @@ class ByT5Tokenizer(PreTrainedTokenizer):
" extra_ids tokens"
)

pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token

pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
# we force left and right stripping for backward compatibility. The byt5tests depend on this.
eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token
# unk token needs to be in the vocab with correct index
self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
self.offset = len(self._added_tokens_decoder)
self._utf_vocab_size = 2**8 # utf is 8 bits
super().__init__(
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
extra_ids=extra_ids,
additional_special_tokens=additional_special_tokens,
extra_ids=0,
additional_special_tokens=additional_special_tokens, # TODO extra ids are not used :sweatywmile:
**kwargs,
)

self._extra_ids = extra_ids

self._utf_vocab_size = 2**8 # utf is 8 bits

# define special tokens dict
self.special_tokens_encoder: Dict[int, str] = {
self.pad_token: 0,
self.eos_token: 1,
self.unk_token: 2,
}
self._num_special_tokens = len(self.special_tokens_encoder)
n = len(additional_special_tokens)
for i, token in enumerate(additional_special_tokens):
self.special_tokens_encoder[token] = self.vocab_size + i - n
self.special_tokens_decoder: Dict[str, int] = {v: k for k, v in self.special_tokens_encoder.items()}

@property
def vocab_size(self):
return self._utf_vocab_size + self._num_special_tokens + self._extra_ids
return self._utf_vocab_size

def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab

def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
@@ -209,34 +202,25 @@ class ByT5Tokenizer(PreTrainedTokenizer):

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token in self.special_tokens_encoder:
token_id = self.special_tokens_encoder[token]
elif token in self.added_tokens_encoder:
token_id = self.added_tokens_encoder[token]
elif len(token) != 1:
token_id = self.unk_token_id

if len(token) != 1:
token_id = None
else:
token_id = ord(token) + self._num_special_tokens
token_id = ord(token) + self.offset

return token_id

def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.special_tokens_decoder:
token = self.special_tokens_decoder[index]
else:
token = chr(index - self._num_special_tokens)
token = chr(index - self.offset)
return token

def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
bstring = b""
for token in tokens:
if token in self.special_tokens_decoder:
tok_string = self.special_tokens_decoder[token].encode("utf-8")
elif token in self.added_tokens_decoder:
tok_string = self.special_tokens_decoder[token].encode("utf-8")
elif token in self.special_tokens_encoder:
tok_string = token.encode("utf-8")
if token in self.added_tokens_decoder:
tok_string = self.added_tokens_decoder[token].encode("utf-8")
elif token in self.added_tokens_encoder:
tok_string = token.encode("utf-8")
else:
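Note (not part of the diff): after this change ByT5's vocab_size is just the 256 byte values, the three special tokens sit in _added_tokens_decoder at ids 0-2, and ordinary single-character byte tokens are shifted by that offset. A minimal standalone sketch of the resulting mapping:

offset = 3  # ids 0, 1, 2 are reserved for <pad>, </s>, <unk>

def byt5_token_to_id(token: str):
    # single byte-characters map to their codepoint plus the special-token offset
    return ord(token) + offset if len(token) == 1 else None

def byt5_id_to_token(index: int) -> str:
    return chr(index - offset)

assert byt5_id_to_token(byt5_token_to_id("a")) == "a"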
@@ -136,6 +136,29 @@ class CamembertTokenizer(PreTrainedTokenizer):

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file

# HACK: These tokens were added by the author for an obscure reason as they were already part of the
# sentencepiece vocabulary (this is the case for <s> and </s> and <unk>).
# In this case it is recommended to properly set the tokens by hand.
self._added_tokens_decoder = {
0: AddedToken("<s>NOTUSED"),
1: AddedToken(pad_token),
2: AddedToken("</s>NOTUSED"),
3: AddedToken(unk_token),
4: AddedToken("<unk>NOTUSED"),
}

self.fairseq_offset = 4  # 3 tokens are newly added, but the offset starts from 4

# legacy: camemebert is a particular case were we have to make sure `"<unk>NOTUSED"` is here
if "added_tokens_decoder" in kwargs:
# this is the only class that requires this unfortunately.....
# the reason is that the fast version has a whole.
kwargs["added_tokens_decoder"].update(self._added_tokens_decoder)

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
@@ -148,15 +171,83 @@ class CamembertTokenizer(PreTrainedTokenizer):
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

@property
def vocab_size(self):
# The length of the vocabulary without added tokens is len(self.sp_model) but the added tokens are added at the beginning.
return len(self.sp_model)

def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.fairseq_offset)}
vocab.update(self.added_tokens_encoder)
return vocab

def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
# specifi to camembert, both 3 and 4 point to the unk token.
if self.sp_model.PieceToId(token) == 0:
# Convert sentence piece unk token to fairseq unk token index
return self.unk_token_id
return self.fairseq_offset + self.sp_model.PieceToId(token)

def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index - self.fairseq_offset)

def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
# TODO decode outputs do not match between fast and slow
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()

def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state

def __setstate__(self, d):
self.__dict__ = d

# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
# HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
# sentencepiece vocabulary (this is the case for <s> and </s>
self.fairseq_tokens_to_ids = {"<s>NOTUSED": 0, "<pad>": 1, "</s>NOTUSED": 2, "<unk>": 3}
self.fairseq_offset = len(self.fairseq_tokens_to_ids)
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
self.sp_model.Load(self.vocab_file)

def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)

if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)

return (out_vocab_file,)

def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
@@ -233,81 +324,3 @@ class CamembertTokenizer(PreTrainedTokenizer):
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

@property
def vocab_size(self):
return len(self.fairseq_tokens_to_ids) + len(self.sp_model)

def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab

def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
elif self.sp_model.PieceToId(token) == 0:
# Convert sentence piece unk token to fairseq unk token index
return self.unk_token_id
return self.fairseq_offset + self.sp_model.PieceToId(token)

def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)

def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()

def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state

def __setstate__(self, d):
self.__dict__ = d

# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)

def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)

if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)

return (out_vocab_file,)
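Note (not part of the diff): the rewrite drops Camembert's separate fairseq_tokens_to_ids table and instead pre-registers the legacy placeholder tokens through added_tokens_decoder, keeping fairseq_offset = 4 for the sentencepiece ids. A minimal sketch of the resulting id layout, assuming the class defaults pad_token="<pad>" and unk_token="<unk>":

from transformers.tokenization_utils import AddedToken

added_tokens_decoder = {
    0: AddedToken("<s>NOTUSED"),
    1: AddedToken("<pad>"),
    2: AddedToken("</s>NOTUSED"),
    3: AddedToken("<unk>"),
    4: AddedToken("<unk>NOTUSED"),
}
fairseq_offset = 4  # final id of a regular piece = sentencepiece id + 4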
@@ -33,7 +33,6 @@ UNICODE_VOCAB_SIZE = 1114112
# Below: Constants defining canonical codepoints for special, pseudo-characters.
# Copied from https://github.com/google-research/language/blob/master/language/canine/special_codepoints.py
PAD = 0

CLS = 0xE000
SEP = 0xE001
BOS = 0xE002
@@ -97,18 +96,6 @@ class CanineTokenizer(PreTrainedTokenizer):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
model_max_length=model_max_length,
**kwargs,
)

# Creates a mapping for looking up the IDs of special symbols.
self._special_codepoints: Dict[str, int] = {}
for codepoint, name in SPECIAL_CODEPOINTS.items():
@@ -122,10 +109,27 @@ class CanineTokenizer(PreTrainedTokenizer):
self._unicode_vocab_size = UNICODE_VOCAB_SIZE
self._num_special_tokens = len(self._special_codepoints)

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
model_max_length=model_max_length,
**kwargs,
)

@property
def vocab_size(self) -> int:
return self._unicode_vocab_size

def get_vocab(self):
vocab = {chr(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab

def _tokenize(self, text: str) -> List[str]:
"""Tokenize a string (i.e. perform character splitting)."""
return list(text)
@@ -312,16 +312,6 @@ class CLIPTokenizer(PreTrainedTokenizer):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token

super().__init__(
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
**kwargs,
)

try:
import ftfy

@@ -348,6 +338,15 @@ class CLIPTokenizer(PreTrainedTokenizer):
re.IGNORECASE,
)

super().__init__(
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
**kwargs,
)

@property
def vocab_size(self):
return len(self.encoder)
@@ -151,6 +151,17 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
for token in [prefix_token, middle_token, suffix_token, eot_token]:
additional_special_tokens += [token] if token is not None else []

self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self._prefix_token = prefix_token
self._middle_token = middle_token
self._suffix_token = suffix_token
self._eot_token = eot_token
self.fill_token = fill_token
self.suffix_first = suffix_first
self.sp_model = self.get_spm_processor()

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
@@ -169,16 +180,6 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
use_default_system_prompt=use_default_system_prompt,
**kwargs,
)
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self._prefix_token = prefix_token
self._middle_token = middle_token
self._suffix_token = suffix_token
self._eot_token = eot_token
self.fill_token = fill_token
self.suffix_first = suffix_first
self.sp_model = self.get_spm_processor()

@property
def unk_token_length(self):
@@ -167,16 +167,6 @@ class CodeGenTokenizer(PreTrainedTokenizer):
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
super().__init__(
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
add_prefix_space=add_prefix_space,
add_bos_token=add_bos_token,
**kwargs,
)
self.add_bos_token = add_bos_token

with open(vocab_file, encoding="utf-8") as vocab_handle:
@@ -194,6 +184,16 @@ class CodeGenTokenizer(PreTrainedTokenizer):

# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
super().__init__(
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
add_prefix_space=add_prefix_space,
add_bos_token=add_bos_token,
**kwargs,
)

@property
def vocab_size(self):
@@ -135,20 +135,6 @@ class ConvBertTokenizer(PreTrainedTokenizer):
strip_accents=None,
**kwargs,
):
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -164,7 +150,22 @@ class ConvBertTokenizer(PreTrainedTokenizer):
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

@property
def do_lower_case(self):
@@ -38,6 +38,9 @@ PRETRAINED_VOCAB_FILES_MAP = {
class CpmTokenizer(PreTrainedTokenizer):
"""Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

def __init__(
self,
vocab_file,
@@ -121,24 +124,6 @@ class CpmTokenizer(PreTrainedTokenizer):

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

self._pad_token_type_id = 3

self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
@@ -157,6 +142,24 @@ class CpmTokenizer(PreTrainedTokenizer):
self.jieba = jieba
self.translator = str.maketrans(" \n", "\u2582\u2583")

super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

self._pad_token_type_id = 3

@property
# Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.vocab_size
def vocab_size(self):
@@ -131,18 +131,6 @@ class CpmAntTokenizer(PreTrainedTokenizer):
**kwargs,
):
requires_backends(self, ["jieba"])
super().__init__(
bod_token=bod_token,
eod_token=eod_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
unk_token=unk_token,
line_token=line_token,
space_token=space_token,
padding_side=padding_side,
**kwargs,
)
self.bod_token = bod_token
self.eod_token = eod_token
self.encoder = load_vocab(vocab_file)
@@ -155,7 +143,20 @@ class CpmAntTokenizer(PreTrainedTokenizer):
self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
self.decoder = {v: k for k, v in self.encoder.items()}

self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=self.unk_token)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

super().__init__(
bod_token=bod_token,
eod_token=eod_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
unk_token=unk_token,
line_token=line_token,
space_token=space_token,
padding_side=padding_side,
**kwargs,
)

@property
def bod_token_id(self):
@@ -139,8 +139,6 @@ class CTRLTokenizer(PreTrainedTokenizer):
control_codes = CONTROL_CODES

def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
super().__init__(unk_token=unk_token, **kwargs)

with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -149,6 +147,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(unk_token=unk_token, **kwargs)

@property
def vocab_size(self):
@@ -201,20 +201,6 @@ class DebertaTokenizer(PreTrainedTokenizer):

# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
add_bos_token=add_bos_token,
**kwargs,
)
self.add_bos_token = add_bos_token

with open(vocab_file, encoding="utf-8") as vocab_handle:
@@ -233,6 +219,20 @@ class DebertaTokenizer(PreTrainedTokenizer):
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
add_bos_token=add_bos_token,
**kwargs,
)

@property
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.vocab_size
def vocab_size(self):
@@ -20,9 +20,12 @@ from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as sp

from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
@@ -124,6 +127,18 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.do_lower_case = do_lower_case
self.split_by_punct = split_by_punct
self.vocab_file = vocab_file
self._tokenizer = SPMTokenizer(
vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
)
unk_token = AddedToken(unk_token, normalized=True, lstrip=False, rstrip=False)
super().__init__(
do_lower_case=do_lower_case,
bos_token=bos_token,
@@ -137,18 +152,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.do_lower_case = do_lower_case
self.split_by_punct = split_by_punct
self.vocab_file = vocab_file
self._tokenizer = SPMTokenizer(
vocab_file, self.all_special_tokens, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
)
self._tokenizer.special_tokens = self.all_special_tokens

@property
def vocab_size(self):
@@ -374,6 +378,7 @@ class SPMTokenizer:
text = "".join(words[word_start:word_end])
return text

# TODO add a deprecation cycle as this can have different behaviour from our API
def add_special_token(self, token):
if token not in self.special_tokens:
self.special_tokens.append(token)
@@ -383,6 +388,9 @@ class SPMTokenizer:
return self.id(token)

def part_of_whole_word(self, token, is_bos=False):
logger.warning_once(
"The `DebertaTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`"
)
if is_bos:
return True
if (
@@ -413,6 +421,9 @@ class SPMTokenizer:
return self.ids_to_tokens[id]

def id(self, sym):
logger.warning_once(
"The `DebertaTokenizer.id` method is deprecated and will be removed in `transformers==4.35`"
)
return self.vocab[sym] if sym in self.vocab else 1

def _encode_as_pieces(self, text):
@@ -460,17 +471,6 @@ class SPMTokenizer:

return words

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
@@ -132,20 +132,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
strip_accents=None,
**kwargs,
):
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -161,7 +147,22 @@ class RetriBertTokenizer(PreTrainedTokenizer):
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

@property
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
@ -296,23 +296,6 @@ class TapexTokenizer(PreTrainedTokenizer):
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
merges_file=merges_file,
|
||||
do_lower_case=do_lower_case,
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
max_cell_length=max_cell_length,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
@ -331,6 +314,24 @@ class TapexTokenizer(PreTrainedTokenizer):
|
||||
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
||||
|
||||
# additional properties
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
merges_file=merges_file,
|
||||
do_lower_case=do_lower_case,
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
max_cell_length=max_cell_length,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.max_cell_length = max_cell_length
|
||||
self.table_linearize = IndexedRowTableLinearize()
|
||||
|
||||
|
@ -149,20 +149,6 @@ class DistilBertTokenizer(PreTrainedTokenizer):
|
||||
strip_accents=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
@ -178,7 +164,21 @@ class DistilBertTokenizer(PreTrainedTokenizer):
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
|
||||
|
@ -152,20 +152,6 @@ class ElectraTokenizer(PreTrainedTokenizer):
|
||||
strip_accents=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
@ -181,7 +167,22 @@ class ElectraTokenizer(PreTrainedTokenizer):
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def do_lower_case(self):
|
||||
|
@ -112,6 +112,19 @@ class ErnieMTokenizer(PreTrainedTokenizer):
|
||||
# is included in the raw text, there should be a match in a non-normalized sentence.
|
||||
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
self.do_lower_case = do_lower_case
|
||||
self.sentencepiece_model_ckpt = sentencepiece_model_ckpt
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(sentencepiece_model_ckpt)
|
||||
|
||||
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
|
||||
if vocab_file is not None:
|
||||
self.vocab = self.load_vocab(filepath=vocab_file)
|
||||
else:
|
||||
self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())}
|
||||
self.reverse_vocab = {v: k for k, v in self.vocab.items()}
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
unk_token=unk_token,
|
||||
@ -124,17 +137,6 @@ class ErnieMTokenizer(PreTrainedTokenizer):
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
self.do_lower_case = do_lower_case
|
||||
self.sentencepiece_model_ckpt = sentencepiece_model_ckpt
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(sentencepiece_model_ckpt)
|
||||
|
||||
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
|
||||
if vocab_file is not None:
|
||||
self.vocab = self.load_vocab(filepath=vocab_file)
|
||||
else:
|
||||
self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())}
|
||||
self.reverse_vocab = {v: k for k, v in self.vocab.items()}
|
||||
|
||||
def get_offset_mapping(self, text):
|
||||
if text is None:
@@ -64,17 +64,23 @@ class EsmTokenizer(PreTrainedTokenizer):
eos_token="<eos>",
**kwargs,
):
super().__init__(**kwargs)
self.all_tokens = load_vocab_file(vocab_file)
self._id_to_token = dict(enumerate(self.all_tokens))
self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)}
self.unk_token = unk_token
self.cls_token = cls_token
self.pad_token = pad_token
self.mask_token = mask_token
self.eos_token = eos_token
super().__init__(
unk_token=unk_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
eos_token=eos_token,
**kwargs,
)

# TODO, all the tokens are added? But they are also part of the vocab... bit strange.
# none of them are special, but they all need special splitting.

self.unique_no_split_tokens = self.all_tokens
self._create_trie(self.unique_no_split_tokens)
self._update_trie(self.unique_no_split_tokens)

def _convert_id_to_token(self, index: int) -> str:
return self._id_to_token.get(index, self.unk_token)
@ -258,19 +258,6 @@ class FlaubertTokenizer(PreTrainedTokenizer):
|
||||
|
||||
self.do_lowercase = do_lowercase
|
||||
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
bos_token=bos_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
lang2id=lang2id,
|
||||
id2lang=id2lang,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
try:
|
||||
import sacremoses
|
||||
except ImportError:
|
||||
@ -303,6 +290,19 @@ class FlaubertTokenizer(PreTrainedTokenizer):
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
bos_token=bos_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
lang2id=lang2id,
|
||||
id2lang=id2lang,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
# Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case
|
||||
def do_lower_case(self):
@@ -15,7 +15,6 @@
""" Tokenization classes for FNet model."""

import os
import re
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
@@ -117,14 +116,19 @@ class FNetTokenizer(PreTrainedTokenizer):
) -> None:
# Mask token behave like a normal word, i.e. include the space before it and
# is included in the raw text, there should be a match in a non-normalized sentence.
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)

mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)

super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
@@ -138,14 +142,6 @@ class FNetTokenizer(PreTrainedTokenizer):
**kwargs,
)

self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)

@property
def vocab_size(self):
return len(self.sp_model)
@@ -237,48 +233,21 @@ class FNetTokenizer(PreTrainedTokenizer):
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
spaces_between_special_tokens: bool = True,
spaces_between_special_tokens: bool = False,
**kwargs,
) -> str:
self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

# To avoid mixing byte-level and unicode for byte-level BPT
# we need to build string separately for added tokens and byte-level tokens
# cf. https://github.com/huggingface/transformers/issues/1133
sub_texts = []
current_sub_text = []
for token in filtered_tokens:
if skip_special_tokens and token in self.all_special_ids:
continue
if token in self.added_tokens_encoder:
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
current_sub_text = []
sub_texts.append(token)
else:
current_sub_text.append(token)
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))

text = super()._decode(
token_ids=token_ids,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
spaces_between_special_tokens=spaces_between_special_tokens,
**kwargs,
)
# Mimic the behavior of the Rust tokenizer:
# No space after <unk>
if spaces_between_special_tokens:
text = re.sub(r"(<unk>) ", r"\1", " ".join(sub_texts))
else:
text = "".join(sub_texts)

clean_up_tokenization_spaces = (
clean_up_tokenization_spaces
if clean_up_tokenization_spaces is not None
else self.clean_up_tokenization_spaces
)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
if not spaces_between_special_tokens:
text = text.replace("<unk> ", "<unk>")
return text

def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
@ -108,11 +108,9 @@ class FNetTokenizerFast(PreTrainedTokenizerFast):
|
||||
):
|
||||
# Mask token behave like a normal word, i.e. include the space before it and
|
||||
# is included in the raw text, there should be a match in a non-normalized sentence.
|
||||
mask_token = (
|
||||
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
|
||||
if isinstance(mask_token, str)
|
||||
else mask_token
|
||||
)
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
|
||||
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
|
@ -197,19 +197,6 @@ class FSMTTokenizer(PreTrainedTokenizer):
|
||||
pad_token="<pad>",
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
langs=langs,
|
||||
src_vocab_file=src_vocab_file,
|
||||
tgt_vocab_file=tgt_vocab_file,
|
||||
merges_file=merges_file,
|
||||
do_lower_case=do_lower_case,
|
||||
unk_token=unk_token,
|
||||
bos_token=bos_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
try:
|
||||
import sacremoses
|
||||
except ImportError:
|
||||
@ -250,6 +237,18 @@ class FSMTTokenizer(PreTrainedTokenizer):
|
||||
merges = [tuple(merge.split()[:2]) for merge in merges]
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
super().__init__(
|
||||
langs=langs,
|
||||
src_vocab_file=src_vocab_file,
|
||||
tgt_vocab_file=tgt_vocab_file,
|
||||
merges_file=merges_file,
|
||||
do_lower_case=do_lower_case,
|
||||
unk_token=unk_token,
|
||||
bos_token=bos_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# hack override
|
||||
def get_vocab(self) -> Dict[str, int]:
|
||||
|
@ -157,22 +157,6 @@ class FunnelTokenizer(PreTrainedTokenizer):
|
||||
strip_accents=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
@ -188,7 +172,23 @@ class FunnelTokenizer(PreTrainedTokenizer):
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
|
||||
|
@ -170,16 +170,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
||||
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
unk_token=unk_token,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
pad_token=pad_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
add_bos_token=add_bos_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.add_bos_token = add_bos_token
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
@ -198,6 +189,17 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
||||
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
unk_token=unk_token,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
pad_token=pad_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
add_bos_token=add_bos_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return len(self.encoder)
|
||||
|
@ -127,14 +127,6 @@ class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer):
|
||||
do_clean_text=False,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
do_clean_text=do_clean_text,
|
||||
**kwargs,
|
||||
)
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
@ -150,6 +142,14 @@ class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer):
|
||||
self.subword_tokenizer = SubWordJapaneseTokenizer(
|
||||
vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
|
||||
)
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
do_clean_text=do_clean_text,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
|
@@ -103,7 +103,7 @@ class GPTSw3Tokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
model_input_names = ["input_ids", "token_type_ids", "attention_mask"]

def __init__(
self,
@ -138,18 +138,6 @@ class GPTSw3Tokenizer(PreTrainedTokenizer):
|
||||
pad_token = "<pad>" if pad_token is None else pad_token
|
||||
bos_token = "<s>" if bos_token is None else bos_token
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
remove_space=remove_space,
|
||||
keep_accents=keep_accents,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.do_lower_case = do_lower_case
|
||||
self.remove_space = remove_space
|
||||
self.keep_accents = keep_accents
|
||||
@ -168,6 +156,18 @@ class GPTSw3Tokenizer(PreTrainedTokenizer):
|
||||
f"[{''.join(map(chr, list(range(0, 9)) + list(range(11, 32)) + list(range(127, 160)) + [160, 173, 8203]))}]"
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
remove_space=remove_space,
|
||||
keep_accents=keep_accents,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__getstate__
|
||||
def __getstate__(self):
|
||||
state = self.__dict__.copy()
|
||||
|
@ -166,15 +166,6 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
|
||||
do_clean_text=False,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
sep_token=sep_token,
|
||||
do_clean_text=do_clean_text,
|
||||
**kwargs,
|
||||
)
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
@ -191,6 +182,16 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
|
||||
vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
sep_token=sep_token,
|
||||
do_clean_text=do_clean_text,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
# Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.vocab_size
|
||||
def vocab_size(self):
|
||||
|
@ -334,21 +334,6 @@ class HerbertTokenizer(PreTrainedTokenizer):
|
||||
id2lang=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
bos_token=bos_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
lang2id=lang2id,
|
||||
id2lang=id2lang,
|
||||
do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
|
||||
tokenizer_file=None,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
try:
|
||||
import sacremoses
|
||||
except ImportError:
|
||||
@ -383,6 +368,21 @@ class HerbertTokenizer(PreTrainedTokenizer):
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
bos_token=bos_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
lang2id=lang2id,
|
||||
id2lang=id2lang,
|
||||
do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
|
||||
tokenizer_file=None,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.bert_pre_tokenizer = BasicTokenizer(
|
||||
do_lower_case=False,
|
||||
never_split=self.all_special_tokens,
|
||||
|
@@ -128,16 +128,10 @@ class JukeboxTokenizer(PreTrainedTokenizer):
**kwargs,
):
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
super().__init__(
unk_token=unk_token,
n_genres=n_genres,
version=version,
max_n_lyric_tokens=max_n_lyric_tokens,
**kwargs,
)
self.version = version
self.max_n_lyric_tokens = max_n_lyric_tokens
self.n_genres = n_genres
self._added_tokens_decoder = {0: unk_token}

with open(artists_file, encoding="utf-8") as vocab_handle:
self.artists_encoder = json.load(vocab_handle)
@@ -157,13 +151,24 @@ class JukeboxTokenizer(PreTrainedTokenizer):
self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()}
super().__init__(
unk_token=unk_token,
n_genres=n_genres,
version=version,
max_n_lyric_tokens=max_n_lyric_tokens,
**kwargs,
)

@property
def vocab_size(self):
return len(self.artists_encoder) + len(self.genres_encoder) + len(self.lyrics_encoder)

def get_vocab(self):
return dict(self.artists_encoder, self.genres_encoder, self.lyrics_encoder)
return {
"artists_encoder": self.artists_encoder,
"genres_encoder": self.genres_encoder,
"lyrics_encoder": self.lyrics_encoder,
}

def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
"""Converts the artist, genre and lyrics tokens to their index using the vocabulary.
@ -134,20 +134,6 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
|
||||
strip_accents=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
@ -163,7 +149,22 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def do_lower_case(self):
|
||||
|
@ -244,6 +244,29 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
additional_special_tokens: Optional[List[str]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
|
||||
)
|
||||
self.vocab = load_vocab(vocab_file)
|
||||
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
|
||||
self.do_basic_tokenize = do_basic_tokenize
|
||||
if do_basic_tokenize:
|
||||
self.basic_tokenizer = BasicTokenizer(
|
||||
do_lower_case=do_lower_case,
|
||||
never_split=never_split,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
|
||||
|
||||
# additional properties
|
||||
self.cls_token_box = cls_token_box
|
||||
self.sep_token_box = sep_token_box
|
||||
self.pad_token_box = pad_token_box
|
||||
self.pad_token_label = pad_token_label
|
||||
self.only_label_first_subword = only_label_first_subword
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
@ -265,30 +288,6 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
|
||||
)
|
||||
self.vocab = load_vocab(vocab_file)
|
||||
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
|
||||
self.do_basic_tokenize = do_basic_tokenize
|
||||
if do_basic_tokenize:
|
||||
self.basic_tokenizer = BasicTokenizer(
|
||||
do_lower_case=do_lower_case,
|
||||
never_split=never_split,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||
|
||||
# additional properties
|
||||
self.cls_token_box = cls_token_box
|
||||
self.sep_token_box = sep_token_box
|
||||
self.pad_token_box = pad_token_box
|
||||
self.pad_token_label = pad_token_label
|
||||
self.only_label_first_subword = only_label_first_subword
|
||||
|
||||
@property
|
||||
def do_lower_case(self):
|
||||
return self.basic_tokenizer.do_lower_case
|
||||
|
@ -303,24 +303,6 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
cls_token_box=cls_token_box,
|
||||
sep_token_box=sep_token_box,
|
||||
pad_token_box=pad_token_box,
|
||||
pad_token_label=pad_token_label,
|
||||
only_label_first_subword=only_label_first_subword,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
@ -344,6 +326,24 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
self.pad_token_label = pad_token_label
|
||||
self.only_label_first_subword = only_label_first_subword
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
cls_token_box=cls_token_box,
|
||||
sep_token_box=sep_token_box,
|
||||
pad_token_box=pad_token_box,
|
||||
pad_token_label=pad_token_label,
|
||||
only_label_first_subword=only_label_first_subword,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size
|
||||
def vocab_size(self):
|
||||
@@ -351,7 +351,9 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):

# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
vocab = dict(self.encoder).copy()
vocab.update(self.added_tokens_encoder)
return vocab

# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe
def bpe(self, token):
@@ -539,7 +541,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
if (
(is_split_into_words or add_prefix_space)
and (len(text) > 0 and not text[0].isspace())
and sum([text.startswith(no_split_token) for no_split_token in self.unique_no_split_tokens]) == 0
and sum([text.startswith(no_split_token) for no_split_token in self.added_tokens_encoder]) == 0
):
text = " " + text
return (text, kwargs)
@ -254,23 +254,6 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
cls_token_box=cls_token_box,
|
||||
sep_token_box=sep_token_box,
|
||||
pad_token_box=pad_token_box,
|
||||
pad_token_label=pad_token_label,
|
||||
only_label_first_subword=only_label_first_subword,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(str(vocab_file))
|
||||
self.vocab_file = vocab_file
|
||||
@ -297,6 +280,23 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
self.pad_token_label = pad_token_label
|
||||
self.only_label_first_subword = only_label_first_subword
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
cls_token_box=cls_token_box,
|
||||
sep_token_box=sep_token_box,
|
||||
pad_token_box=pad_token_box,
|
||||
pad_token_label=pad_token_label,
|
||||
only_label_first_subword=only_label_first_subword,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def __getstate__(self):
|
||||
state = self.__dict__.copy()
|
||||
state["sp_model"] = None
|
||||
|
@ -197,21 +197,10 @@ class LEDTokenizer(PreTrainedTokenizer):
|
||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
||||
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
# TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. See `test_embeded_special_tokens`
|
||||
# Also this not only will strip the spaces but any punctuation
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
@ -228,6 +217,19 @@ class LEDTokenizer(PreTrainedTokenizer):
|
||||
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
||||
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
# Copied from transformers.models.bart.tokenization_bart.BartTokenizer.vocab_size
|
||||
def vocab_size(self):
|
||||
|
@ -152,6 +152,7 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
|
||||
trim_offsets=True,
|
||||
**kwargs,
|
||||
):
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
|
@ -122,20 +122,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
|
||||
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
||||
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
add_bos_token=add_bos_token,
|
||||
add_eos_token=add_eos_token,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||
use_default_system_prompt=use_default_system_prompt,
|
||||
spaces_between_special_tokens=spaces_between_special_tokens,
|
||||
legacy=legacy,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if legacy is None:
|
||||
logger.warning_once(
|
||||
f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
|
||||
@ -151,9 +138,23 @@ class LlamaTokenizer(PreTrainedTokenizer):
|
||||
self.add_bos_token = add_bos_token
|
||||
self.add_eos_token = add_eos_token
|
||||
self.use_default_system_prompt = use_default_system_prompt
|
||||
|
||||
self.sp_model = self.get_spm_processor()
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
add_bos_token=add_bos_token,
|
||||
add_eos_token=add_eos_token,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||
use_default_system_prompt=use_default_system_prompt,
|
||||
spaces_between_special_tokens=spaces_between_special_tokens,
|
||||
legacy=legacy,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def unk_token_length(self):
|
||||
return len(self.sp_model.encode(str(self.unk_token)))
@@ -33,6 +33,14 @@ else:
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}

PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
},
"tokenizer_file": {
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
},
}
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

@@ -93,6 +101,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
"""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
slow_tokenizer_class = LlamaTokenizer
padding_side = "left"
model_input_names = ["input_ids", "attention_mask"]
@ -212,28 +212,21 @@ class LongformerTokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
):
|
||||
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
||||
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
||||
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
||||
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
|
||||
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
|
||||
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
||||
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
**kwargs,
|
||||
mask_token = (
|
||||
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
|
||||
if isinstance(mask_token, str)
|
||||
else mask_token
|
||||
)
|
||||
|
||||
# these special tokens are not part of the vocab.json, let's add them in the correct order
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
@ -250,12 +243,27 @@ class LongformerTokenizer(PreTrainedTokenizer):
|
||||
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
||||
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return len(self.encoder)
|
||||
|
||||
def get_vocab(self):
|
||||
return dict(self.encoder, **self.added_tokens_encoder)
|
||||
vocab = dict(self.encoder).copy()
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
return vocab
|
||||
|
||||
def bpe(self, token):
|
||||
if token in self.cache:
|
||||
|
@ -192,6 +192,11 @@ class LongformerTokenizerFast(PreTrainedTokenizerFast):
|
||||
trim_offsets=True,
|
||||
**kwargs,
|
||||
):
|
||||
mask_token = (
|
||||
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
|
||||
if isinstance(mask_token, str)
|
||||
else mask_token
|
||||
)
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
|
@ -326,28 +326,6 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
task=task,
|
||||
max_entity_length=32,
|
||||
max_mention_length=30,
|
||||
entity_token_1="<ent>",
|
||||
entity_token_2="<ent2>",
|
||||
entity_unk_token=entity_unk_token,
|
||||
entity_pad_token=entity_pad_token,
|
||||
entity_mask_token=entity_mask_token,
|
||||
entity_mask2_token=entity_mask2_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
@ -407,6 +385,28 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
|
||||
self.max_mention_length = max_mention_length
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
task=task,
|
||||
max_entity_length=32,
|
||||
max_mention_length=30,
|
||||
entity_token_1="<ent>",
|
||||
entity_token_2="<ent2>",
|
||||
entity_unk_token=entity_unk_token,
|
||||
entity_pad_token=entity_pad_token,
|
||||
entity_mask_token=entity_mask_token,
|
||||
entity_mask2_token=entity_mask2_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size with Roberta->Luke, RoBERTa->LUKE
|
||||
def vocab_size(self):
|
||||
@@ -414,7 +414,9 @@ class LukeTokenizer(PreTrainedTokenizer):

# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab with Roberta->Luke, RoBERTa->LUKE
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
vocab = dict(self.encoder).copy()
vocab.update(self.added_tokens_encoder)
return vocab

# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Luke, RoBERTa->LUKE
def bpe(self, token):
@ -126,20 +126,6 @@ class LxmertTokenizer(PreTrainedTokenizer):
|
||||
strip_accents=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
@ -155,7 +141,22 @@ class LxmertTokenizer(PreTrainedTokenizer):
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def do_lower_case(self):
|
||||
|
@@ -150,26 +150,11 @@ class M2M100Tokenizer(PreTrainedTokenizer):
fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes]
self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in fairseq_language_code}

kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] += [
self.get_lang_token(lang_code)
for lang_code in fairseq_language_code
if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"]
]

super().__init__(
src_lang=src_lang,
tgt_lang=tgt_lang,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
language_codes=language_codes,
sp_model_kwargs=self.sp_model_kwargs,
num_madeup_words=num_madeup_words,
**kwargs,
)
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
for lang_code in fairseq_language_code:
token = self.get_lang_token(lang_code)
if token not in additional_special_tokens and lang_code not in str(token) not in self.added_tokens_encoder:
additional_special_tokens.append(token)

self.vocab_file = vocab_file
self.encoder = load_json(vocab_file)
@ -188,13 +173,33 @@ class M2M100Tokenizer(PreTrainedTokenizer):
|
||||
self._src_lang = src_lang if src_lang is not None else "en"
|
||||
self.tgt_lang = tgt_lang
|
||||
self.cur_lang_id = self.get_lang_id(self._src_lang)
|
||||
self.set_src_lang_special_tokens(self._src_lang)
|
||||
|
||||
self.num_madeup_words = num_madeup_words
|
||||
|
||||
super().__init__(
|
||||
src_lang=src_lang,
|
||||
tgt_lang=tgt_lang,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
sep_token=sep_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
language_codes=language_codes,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
num_madeup_words=num_madeup_words,
|
||||
**kwargs,
|
||||
)
|
||||
self.set_src_lang_special_tokens(self._src_lang)
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
return len(self.encoder) + len(self.lang_token_to_id)
|
||||
return len(self.encoder)
|
||||
|
||||
def get_vocab(self) -> Dict:
|
||||
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
return vocab
|
||||
|
||||
@property
|
||||
def src_lang(self) -> str:
|
||||
@ -290,11 +295,6 @@ class M2M100Tokenizer(PreTrainedTokenizer):
|
||||
# We don't expect to process pairs, but leave the pair logic for API consistency
|
||||
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
|
||||
|
||||
def get_vocab(self) -> Dict:
|
||||
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
return vocab
|
||||
|
||||
def __getstate__(self) -> Dict:
|
||||
state = self.__dict__.copy()
|
||||
state["sp_model"] = None
|
||||
|
@ -144,26 +144,13 @@ class MarianTokenizer(PreTrainedTokenizer):
|
||||
) -> None:
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
super().__init__(
|
||||
# bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id
|
||||
source_lang=source_lang,
|
||||
target_lang=target_lang,
|
||||
unk_token=unk_token,
|
||||
eos_token=eos_token,
|
||||
pad_token=pad_token,
|
||||
model_max_length=model_max_length,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
target_vocab_file=target_vocab_file,
|
||||
separate_vocabs=separate_vocabs,
|
||||
**kwargs,
|
||||
)
|
||||
assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"
|
||||
|
||||
self.separate_vocabs = separate_vocabs
|
||||
self.encoder = load_json(vocab)
|
||||
if self.unk_token not in self.encoder:
|
||||
raise KeyError("<unk> token must be in vocab")
|
||||
assert self.pad_token in self.encoder
|
||||
if unk_token not in self.encoder:
|
||||
raise KeyError("<unk> token must be in the vocab")
|
||||
assert pad_token in self.encoder
|
||||
|
||||
if separate_vocabs:
|
||||
self.target_encoder = load_json(target_vocab_file)
|
||||
@ -187,6 +174,20 @@ class MarianTokenizer(PreTrainedTokenizer):
|
||||
|
||||
self._setup_normalizer()
|
||||
|
||||
super().__init__(
|
||||
# bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id
|
||||
source_lang=source_lang,
|
||||
target_lang=target_lang,
|
||||
unk_token=unk_token,
|
||||
eos_token=eos_token,
|
||||
pad_token=pad_token,
|
||||
model_max_length=model_max_length,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
target_vocab_file=target_vocab_file,
|
||||
separate_vocabs=separate_vocabs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _setup_normalizer(self):
|
||||
try:
|
||||
from sacremoses import MosesPunctNormalizer
|
||||
|
@ -232,27 +232,6 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
merges_file=merges_file,
|
||||
tags_dict=tags_dict,
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
max_depth=max_depth,
|
||||
max_width=max_width,
|
||||
pad_width=pad_width,
|
||||
pad_token_label=pad_token_label,
|
||||
only_label_first_subword=only_label_first_subword,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
|
||||
@ -279,6 +258,28 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
self.pad_tag_id = self.unk_tag_id + 1
|
||||
self.pad_xpath_tags_seq = [self.pad_tag_id] * self.max_depth
|
||||
self.pad_xpath_subs_seq = [self.pad_width] * self.max_depth
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
merges_file=merges_file,
|
||||
tags_dict=tags_dict,
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
max_depth=max_depth,
|
||||
max_width=max_width,
|
||||
pad_width=pad_width,
|
||||
pad_token_label=pad_token_label,
|
||||
only_label_first_subword=only_label_first_subword,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.pad_token_label = pad_token_label
|
||||
self.only_label_first_subword = only_label_first_subword
|
||||
|
||||
@@ -312,7 +313,9 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
return len(self.encoder)

def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
vocab = self.encoder.copy()
vocab.update(self.added_tokens_encoder)
return vocab

def bpe(self, token):
if token in self.cache:
@ -26,6 +26,7 @@ from tokenizers import pre_tokenizers, processors
|
||||
from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
|
||||
from ...tokenization_utils_base import (
|
||||
ENCODE_KWARGS_DOCSTRING,
|
||||
AddedToken,
|
||||
BatchEncoding,
|
||||
EncodedInput,
|
||||
PreTokenizedInput,
|
||||
@ -182,6 +183,16 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
trim_offsets=False,
|
||||
**kwargs,
|
||||
):
|
||||
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
||||
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
||||
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
|
||||
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
|
||||
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
||||
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
merges_file=merges_file,
|
||||
|
@@ -101,22 +101,6 @@ class MBartTokenizer(PreTrainedTokenizer):

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
tokenizer_file=None,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -142,14 +126,30 @@ class MBartTokenizer(PreTrainedTokenizer):

self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
self._additional_special_tokens = list(self.lang_code_to_id.keys())
_additional_special_tokens = list(self.lang_code_to_id.keys())

if additional_special_tokens is not None:
# Only add those special tokens if they are not already there.
self._additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in self._additional_special_tokens]
_additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in _additional_special_tokens]
)

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
tokenizer_file=None,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=_additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
self.tgt_lang = tgt_lang
@@ -112,6 +112,14 @@ class MBartTokenizerFast(PreTrainedTokenizerFast):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

_additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()

if additional_special_tokens is not None:
# Only add those special tokens if they are not already there.
_additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in _additional_special_tokens]
)

super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
@@ -124,21 +132,11 @@ class MBartTokenizerFast(PreTrainedTokenizerFast):
mask_token=mask_token,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=additional_special_tokens,
additional_special_tokens=_additional_special_tokens,
**kwargs,
)

self.vocab_file = vocab_file

_additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()

if additional_special_tokens is not None:
# Only add those special tokens if they are not already there.
_additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in _additional_special_tokens]
)

self.add_special_tokens({"additional_special_tokens": _additional_special_tokens})
self.lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
}
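The MBart hunks stop mutating `self._additional_special_tokens` after the fact; the full list (language codes plus whatever the caller passed) is assembled locally and handed to `super().__init__()`, so the base class registers all special tokens in one place. A rough sketch of that pattern, with made-up language codes standing in for FAIRSEQ_LANGUAGE_CODES:

    from transformers import PreTrainedTokenizer

    LANGUAGE_CODES = ["en_XX", "ro_RO"]  # stand-in, not the real FAIRSEQ list


    class LangAwareTokenizer(PreTrainedTokenizer):
        def __init__(self, additional_special_tokens=None, **kwargs):
            # Build the complete list up front instead of extending
            # self._additional_special_tokens after super().__init__().
            _additional_special_tokens = LANGUAGE_CODES.copy()
            if additional_special_tokens is not None:
                # Only add those special tokens if they are not already there.
                _additional_special_tokens.extend(
                    t for t in additional_special_tokens if t not in _additional_special_tokens
                )
            super().__init__(
                additional_special_tokens=_additional_special_tokens,
                **kwargs,
            )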
@@ -137,19 +137,6 @@ class MBart50Tokenizer(PreTrainedTokenizer):
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]

super().__init__(
src_lang=src_lang,
tgt_lang=tgt_lang,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -176,6 +163,19 @@ class MBart50Tokenizer(PreTrainedTokenizer):
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

super().__init__(
src_lang=src_lang,
tgt_lang=tgt_lang,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
self.tgt_lang = tgt_lang
@@ -62,6 +62,9 @@ class MgpstrTokenizer(PreTrainedTokenizer):
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

def __init__(self, vocab_file, unk_token="[GO]", bos_token="[GO]", eos_token="[s]", pad_token="[GO]", **kwargs):
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.vocab = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.vocab.items()}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
@@ -70,16 +73,14 @@ class MgpstrTokenizer(PreTrainedTokenizer):
**kwargs,
)

with open(vocab_file, encoding="utf-8") as vocab_handle:
self.vocab = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.vocab.items()}

@property
def vocab_size(self):
return len(self.vocab)

def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
vocab = dict(self.vocab).copy()
vocab.update(self.added_tokens_encoder)
return vocab

def _tokenize(self, text):
"""Tokenize a string."""
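The recurring `get_vocab` change (here in MgpstrTokenizer, earlier in MarkupLM) swaps `dict(self.vocab, **self.added_tokens_encoder)` for an explicit copy plus update. The result is the same mapping, but the copy makes the intent clear and never risks mutating the base vocab. A tiny self-contained check with toy dictionaries:

    # Toy data, not taken from any real checkpoint.
    base_vocab = {"hello": 0, "world": 1}
    added_tokens_encoder = {"<new_tok>": 2}

    def get_vocab():
        vocab = base_vocab.copy()           # leave the original mapping untouched
        vocab.update(added_tokens_encoder)  # added tokens win on key clashes
        return vocab

    assert get_vocab() == {"hello": 0, "world": 1, "<new_tok>": 2}
    assert "<new_tok>" not in base_vocab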
@@ -272,32 +272,11 @@ class MLukeTokenizer(PreTrainedTokenizer):
if isinstance(entity_token_2, str)
else entity_token_2
)
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] += [entity_token_1, entity_token_2]
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
additional_special_tokens += [entity_token_1, entity_token_2]

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
task=task,
max_entity_length=max_entity_length,
max_mention_length=max_mention_length,
entity_token_1=entity_token_1,
entity_token_2=entity_token_2,
entity_unk_token=entity_unk_token,
entity_pad_token=entity_pad_token,
entity_mask_token=entity_mask_token,
entity_mask2_token=entity_mask2_token,
**kwargs,
)

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -345,6 +324,65 @@ class MLukeTokenizer(PreTrainedTokenizer):

self.max_mention_length = max_mention_length

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
task=task,
max_entity_length=max_entity_length,
max_mention_length=max_mention_length,
entity_token_1=entity_token_1,
entity_token_2=entity_token_2,
entity_unk_token=entity_unk_token,
entity_pad_token=entity_pad_token,
entity_mask_token=entity_mask_token,
entity_mask2_token=entity_mask2_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)

@property
# Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.vocab_size
def vocab_size(self):
return len(self.sp_model) + self.fairseq_offset + 1  # Add the <mask> token

# Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.get_vocab
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab

# Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._tokenize
def _tokenize(self, text: str) -> List[str]:
# TODO check if the t5/llama PR also applies here
return self.sp_model.encode(text, out_type=str)

# Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._convert_token_to_id
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)

# Need to return unknown token if the SP model returned 0
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)

def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string

def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
@@ -1591,39 +1629,3 @@ class MLukeTokenizer(PreTrainedTokenizer):
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

@property
# Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.vocab_size
def vocab_size(self):
return len(self.sp_model) + self.fairseq_offset + 1  # Add the <mask> token

# Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.get_vocab
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab

# Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._tokenize
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)

# Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._convert_token_to_id
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)

# Need to return unknown token if the SP model returned 0
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)

def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
@@ -124,20 +124,6 @@ class MobileBertTokenizer(PreTrainedTokenizer):
strip_accents=None,
**kwargs,
):
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -153,7 +139,22 @@ class MobileBertTokenizer(PreTrainedTokenizer):
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

@property
def do_lower_case(self):
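MobileBERT's `WordpieceTokenizer` now receives `str(unk_token)` instead of `self.unk_token`: with `super().__init__()` moved to the end, `self.unk_token` is not set yet at this point, and the argument itself may be an `AddedToken` rather than a plain string. A quick illustration of the coercion (the token value is arbitrary):

    from transformers import AddedToken

    unk_token = AddedToken("[UNK]", lstrip=False, rstrip=False)

    # str() unwraps the AddedToken so components that expect a plain string,
    # such as a word-piece sub-tokenizer, keep working before super().__init__().
    print(str(unk_token))  # [UNK]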
@@ -157,6 +157,23 @@ class MPNetTokenizer(PreTrainedTokenizer):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
@@ -173,23 +190,6 @@ class MPNetTokenizer(PreTrainedTokenizer):
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@@ -199,7 +199,9 @@ class MPNetTokenizer(PreTrainedTokenizer):
return len(self.vocab)

def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
vocab = self.vocab.copy()
vocab.update(self.added_tokens_encoder)
return vocab

def _tokenize(self, text):
split_tokens = []
@@ -126,6 +126,16 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
strip_accents=None,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
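Throughout these fast tokenizers the mask token keeps `lstrip=True, rstrip=False`, i.e. the space written just before `<mask>` is treated as part of the special-token match rather than tokenized on its own. A short check with any RoBERTa-style checkpoint (`roberta-base` here is only an example):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("roberta-base")

    # The space before <mask> is absorbed by the special token (lstrip=True),
    # so it does not show up as a separate piece in the output.
    ids = tok("Paris is the <mask> of France.")["input_ids"]
    print(tok.convert_ids_to_tokens(ids))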
@@ -193,19 +193,6 @@ class MvpTokenizer(PreTrainedTokenizer):

# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -222,12 +209,27 @@ class MvpTokenizer(PreTrainedTokenizer):
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)

@property
def vocab_size(self):
return len(self.encoder)

def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
vocab = self.encoder.copy()
vocab.update(self.added_tokens_encoder)
return vocab

def bpe(self, token):
if token in self.cache:
@@ -153,6 +153,15 @@ class MvpTokenizerFast(PreTrainedTokenizerFast):
trim_offsets=True,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
merges_file,
@@ -149,23 +149,6 @@ class NllbTokenizer(PreTrainedTokenizer):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.legacy_behaviour = legacy_behaviour

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
tokenizer_file=tokenizer_file,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
legacy_behaviour=legacy_behaviour,
**kwargs,
)

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -190,16 +173,35 @@ class NllbTokenizer(PreTrainedTokenizer):

self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
self._additional_special_tokens = list(self.lang_code_to_id.keys())

if additional_special_tokens is not None:
# Only add those special tokens if they are not already there.
self._additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in self._additional_special_tokens]
)

self._src_lang = src_lang if src_lang is not None else "eng_Latn"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]

_additional_special_tokens = list(self.lang_code_to_id.keys())

if additional_special_tokens is not None:
# Only add those special tokens if they are not already there.
_additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in _additional_special_tokens]
)

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
tokenizer_file=tokenizer_file,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=_additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
legacy_behaviour=legacy_behaviour,
**kwargs,
)

self.tgt_lang = tgt_lang
self.set_src_lang_special_tokens(self._src_lang)
@@ -157,6 +157,15 @@ class NllbTokenizerFast(PreTrainedTokenizerFast):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.legacy_behaviour = legacy_behaviour

_additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()

if additional_special_tokens is not None:
# Only add those special tokens if they are not already there.
_additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in _additional_special_tokens]
)

super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
@@ -169,22 +178,13 @@ class NllbTokenizerFast(PreTrainedTokenizerFast):
mask_token=mask_token,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=additional_special_tokens,
additional_special_tokens=_additional_special_tokens,
legacy_behaviour=legacy_behaviour,
**kwargs,
)

self.vocab_file = vocab_file

_additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()

if additional_special_tokens is not None:
# Only add those special tokens if they are not already there.
_additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in _additional_special_tokens]
)

self.add_special_tokens({"additional_special_tokens": _additional_special_tokens})
self.lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
}
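Because the language codes are now plain additional special tokens, `NllbTokenizerFast` derives `lang_code_to_id` with `convert_tokens_to_ids` instead of keeping a separate fairseq-style table. The same lookup can be done from user code; a small sketch (the checkpoint name is just an example):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

    # Language codes are registered as special tokens, so an ordinary lookup works.
    lang_code_to_id = {
        code: tok.convert_tokens_to_ids(code) for code in ("eng_Latn", "fra_Latn")
    }
    print(lang_code_to_id)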
@@ -269,8 +269,6 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
model_input_names = ["input_ids", "attention_mask"]

def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
super().__init__(unk_token=unk_token, **kwargs)

try:
import ftfy
from spacy.lang.en import English
@@ -292,6 +290,8 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}

super().__init__(unk_token=unk_token, **kwargs)

@property
def do_lower_case(self):
return True
@ -18,7 +18,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import sentencepiece as spm
|
||||
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
@ -38,6 +38,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
# TODO ArthurZ refactor this to only use the added_tokens_encoder
|
||||
class PegasusTokenizer(PreTrainedTokenizer):
|
||||
r"""
|
||||
Construct a PEGASUS tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
|
||||
@ -95,8 +96,6 @@ class PegasusTokenizer(PreTrainedTokenizer):
|
||||
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
|
||||
BPE-dropout.
|
||||
"""
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
@ -122,7 +121,6 @@ class PegasusTokenizer(PreTrainedTokenizer):
|
||||
f"additional_special_tokens should be of type {type(list)}, but is"
|
||||
f" {type(additional_special_tokens)}"
|
||||
)
|
||||
|
||||
additional_special_tokens_extended = (
|
||||
([mask_token_sent] + additional_special_tokens)
|
||||
if mask_token_sent not in additional_special_tokens and mask_token_sent is not None
|
||||
@ -140,10 +138,27 @@ class PegasusTokenizer(PreTrainedTokenizer):
|
||||
)
|
||||
additional_special_tokens = additional_special_tokens_extended
|
||||
else:
|
||||
additional_special_tokens_extended = []
|
||||
additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else []
|
||||
additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
|
||||
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
self.mask_token_sent = mask_token_sent
|
||||
self.vocab_file = vocab_file
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(vocab_file)
|
||||
|
||||
self._added_tokens_decoder = {
|
||||
0: AddedToken(str(pad_token), lstrip=True, rstrip=True),
|
||||
1: AddedToken(str(eos_token), lstrip=True, rstrip=True),
|
||||
}
|
||||
|
||||
if self.mask_token_sent is not None:
|
||||
self._added_tokens_decoder[2] = AddedToken(mask_token_sent)
|
||||
self._added_tokens_decoder[3] = AddedToken(str(mask_token))
|
||||
|
||||
for i in range(1, self.offset - 1):
|
||||
self._added_tokens_decoder[len(self._added_tokens_decoder)] = AddedToken(f"<unk_{i}>")
|
||||
|
||||
super().__init__(
|
||||
eos_token=eos_token,
|
||||
@ -156,31 +171,6 @@ class PegasusTokenizer(PreTrainedTokenizer):
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
self.mask_token_sent = mask_token_sent
|
||||
self.vocab_file = vocab_file
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(vocab_file)
|
||||
|
||||
# add special tokens to encoder dict
|
||||
self.encoder: Dict[int, str] = {
|
||||
0: self.pad_token,
|
||||
1: self.eos_token,
|
||||
}
|
||||
|
||||
if self.mask_token_sent is not None:
|
||||
self.encoder.update(
|
||||
{
|
||||
2: self.mask_token_sent,
|
||||
3: self.mask_token,
|
||||
}
|
||||
)
|
||||
|
||||
if self.offset > 0:
|
||||
# entries 2-104 are only used for pretraining and called <mask_1>, <mask_2>, unk_2, ...unk_102
|
||||
# mask_token_sent is already added to list -> so start at 1
|
||||
self.encoder.update({i + 3: additional_special_tokens[i] for i in range(1, self.offset - 1)})
|
||||
|
||||
self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
@ -212,21 +202,14 @@ class PegasusTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def _convert_token_to_id(self, token: str) -> int:
|
||||
"""Converts a token (str) to an id using the vocab."""
|
||||
if token in self.decoder:
|
||||
return self.decoder[token]
|
||||
elif token in self.added_tokens_decoder:
|
||||
return self.added_tokens_decoder[token]
|
||||
sp_id = self.sp_model.piece_to_id(token)
|
||||
return sp_id + self.offset
|
||||
|
||||
def _convert_id_to_token(self, index: int) -> str:
|
||||
"""Converts an index (integer) to a token (str) using the vocab."""
|
||||
if index in self.encoder:
|
||||
return self.encoder[index]
|
||||
elif index in self.added_tokens_encoder:
|
||||
return self.added_tokens_encoder[index]
|
||||
else:
|
||||
token = self.sp_model.IdToPiece(index - self.offset)
|
||||
if index < self.offset:
|
||||
return self.sp_model.IdToPiece(index)
|
||||
token = self.sp_model.IdToPiece(index - self.offset)
|
||||
return token
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
|
||||
|
@ -75,6 +75,18 @@ class PerceiverTokenizer(PreTrainedTokenizer):
|
||||
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
|
||||
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
|
||||
|
||||
self._utf_vocab_size = 2**8 # utf is 8 bits
|
||||
|
||||
# Since these tokens are not part of the vocabulary, we manually add them
|
||||
self._added_tokens_decoder: Dict[str, int] = {
|
||||
0: pad_token,
|
||||
1: bos_token,
|
||||
2: eos_token,
|
||||
3: mask_token,
|
||||
4: cls_token,
|
||||
5: sep_token,
|
||||
}
|
||||
self._num_special_tokens = len(self._added_tokens_decoder)
|
||||
super().__init__(
|
||||
pad_token=pad_token,
|
||||
bos_token=bos_token,
|
||||
@ -86,31 +98,17 @@ class PerceiverTokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self._utf_vocab_size = 2**8 # utf is 8 bits
|
||||
|
||||
# define special tokens dict
|
||||
self.special_tokens_encoder: Dict[str, int] = {
|
||||
self.pad_token: 0,
|
||||
self.bos_token: 1,
|
||||
self.eos_token: 2,
|
||||
self.mask_token: 3,
|
||||
self.cls_token: 4,
|
||||
self.sep_token: 5,
|
||||
}
|
||||
self._num_special_tokens = len(self.special_tokens_encoder)
|
||||
self.special_tokens_decoder: Dict[int, str] = {v: k for k, v in self.special_tokens_encoder.items()}
|
||||
|
||||
def get_vocab(self) -> Dict[str, int]:
|
||||
vocab = self.special_tokens_encoder.copy()
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
vocab = {}
|
||||
for i in range(self._utf_vocab_size):
|
||||
token = chr(i)
|
||||
vocab[token] = i + len(self.special_tokens_encoder)
|
||||
vocab[token] = i + self._num_special_tokens
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
return vocab
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return self._utf_vocab_size + self._num_special_tokens
|
||||
return self._utf_vocab_size
|
||||
|
||||
def get_special_tokens_mask(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||
@ -171,11 +169,7 @@ class PerceiverTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
"""Converts a token (str) in an id using the vocab."""
|
||||
if token in self.special_tokens_encoder:
|
||||
token_id = self.special_tokens_encoder[token]
|
||||
elif token in self.added_tokens_encoder:
|
||||
token_id = self.added_tokens_encoder[token]
|
||||
elif len(token) != 1:
|
||||
if len(token) != 1:
|
||||
token_id = self.unk_token_id
|
||||
else:
|
||||
token_id = ord(token) + self._num_special_tokens
|
||||
@ -183,26 +177,16 @@ class PerceiverTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def _convert_id_to_token(self, index):
|
||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||
if index in self.special_tokens_decoder:
|
||||
token = self.special_tokens_decoder[index]
|
||||
elif index in self.added_tokens_decoder:
|
||||
token = self.added_tokens_decoder[index]
|
||||
else:
|
||||
token = chr(index - self._num_special_tokens)
|
||||
token = chr(index - self._num_special_tokens)
|
||||
return token
|
||||
|
||||
# TODO @ArthurZ refactor this as well....
|
||||
def convert_tokens_to_string(self, tokens):
|
||||
"""Converts a sequence of tokens (string) in a single string."""
|
||||
bstring = b""
|
||||
for token in tokens:
|
||||
if token in self.special_tokens_decoder:
|
||||
tok_string = self.special_tokens_decoder[token].encode("utf-8")
|
||||
elif token in self.added_tokens_decoder:
|
||||
tok_string = self.special_tokens_decoder[token].encode("utf-8")
|
||||
elif token in self.special_tokens_encoder:
|
||||
tok_string = token.encode("utf-8")
|
||||
elif token in self.added_tokens_encoder:
|
||||
tok_string = token.encode("utf-8")
|
||||
if token in self.added_tokens_encoder:
|
||||
tok_string = str(token).encode("utf-8")
|
||||
else:
|
||||
tok_string = bytes([ord(token)])
|
||||
bstring += tok_string
|
||||
|
@ -131,6 +131,26 @@ class PhobertTokenizer(PreTrainedTokenizer):
|
||||
mask_token="<mask>",
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_file = vocab_file
|
||||
self.merges_file = merges_file
|
||||
|
||||
self.encoder = {}
|
||||
self.encoder[bos_token] = 0
|
||||
self.encoder[pad_token] = 1
|
||||
self.encoder[eos_token] = 2
|
||||
self.encoder[unk_token] = 3
|
||||
|
||||
self.add_from_file(vocab_file)
|
||||
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
|
||||
with open(merges_file, encoding="utf-8") as merges_handle:
|
||||
merges = merges_handle.read().split("\n")[:-1]
|
||||
merges = [tuple(merge.split()[:-1]) for merge in merges]
|
||||
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
@ -142,25 +162,6 @@ class PhobertTokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_file = vocab_file
|
||||
self.merges_file = merges_file
|
||||
|
||||
self.encoder = {}
|
||||
self.encoder[self.bos_token] = 0
|
||||
self.encoder[self.pad_token] = 1
|
||||
self.encoder[self.eos_token] = 2
|
||||
self.encoder[self.unk_token] = 3
|
||||
|
||||
self.add_from_file(vocab_file)
|
||||
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
|
||||
with open(merges_file, encoding="utf-8") as merges_handle:
|
||||
merges = merges_handle.read().split("\n")[:-1]
|
||||
merges = [tuple(merge.split()[:-1]) for merge in merges]
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
|
||||
def build_inputs_with_special_tokens(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
|
@ -195,23 +195,6 @@ class PLBartTokenizer(PreTrainedTokenizer):
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
language_codes=language_codes,
|
||||
tokenizer_file=tokenizer_file,
|
||||
src_lang=src_lang,
|
||||
tgt_lang=tgt_lang,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
src_lang = self._convert_lang_code_special_format(src_lang)
|
||||
tgt_lang = self._convert_lang_code_special_format(tgt_lang)
|
||||
|
||||
@ -245,12 +228,12 @@ class PLBartTokenizer(PreTrainedTokenizer):
|
||||
|
||||
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
|
||||
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
|
||||
self._additional_special_tokens = list(self.lang_code_to_id.keys())
|
||||
_additional_special_tokens = list(self.lang_code_to_id.keys())
|
||||
|
||||
if additional_special_tokens is not None:
|
||||
# Only add those special tokens if they are not already there.
|
||||
self._additional_special_tokens.extend(
|
||||
[t for t in additional_special_tokens if t not in self._additional_special_tokens]
|
||||
_additional_special_tokens.extend(
|
||||
[t for t in additional_special_tokens if t not in _additional_special_tokens]
|
||||
)
|
||||
|
||||
if self.language_codes == "base":
|
||||
@ -262,6 +245,23 @@ class PLBartTokenizer(PreTrainedTokenizer):
|
||||
self._src_lang = src_lang if src_lang is not None else "__en_XX__"
|
||||
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
language_codes=language_codes,
|
||||
tokenizer_file=tokenizer_file,
|
||||
src_lang=src_lang,
|
||||
tgt_lang=tgt_lang,
|
||||
additional_special_tokens=_additional_special_tokens,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.tgt_lang = tgt_lang
|
||||
self.set_src_lang_special_tokens(self._src_lang)
|
||||
|
||||
|
@ -101,14 +101,6 @@ class Pop2PianoTokenizer(PreTrainedTokenizer):
|
||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
||||
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
||||
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
eos_token=eos_token,
|
||||
pad_token=pad_token,
|
||||
bos_token=bos_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.default_velocity = default_velocity
|
||||
self.num_bars = num_bars
|
||||
|
||||
@ -119,6 +111,14 @@ class Pop2PianoTokenizer(PreTrainedTokenizer):
|
||||
# create mappings for encoder
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
eos_token=eos_token,
|
||||
pad_token=pad_token,
|
||||
bos_token=bos_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
"""Returns the vocabulary size of the tokenizer."""
|
||||
|
@ -354,21 +354,6 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
|
||||
strip_accents: Optional[bool] = None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
x_sep_token=x_sep_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
self.unique_no_split_tokens.append(x_sep_token)
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
@ -384,7 +369,21 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
x_sep_token=x_sep_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
|
@ -157,20 +157,6 @@ class RealmTokenizer(PreTrainedTokenizer):
|
||||
strip_accents=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
@ -186,7 +172,20 @@ class RealmTokenizer(PreTrainedTokenizer):
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def do_lower_case(self):
|
||||
|
@ -106,6 +106,10 @@ class ReformerTokenizer(PreTrainedTokenizer):
|
||||
) -> None:
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
self.vocab_file = vocab_file
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(vocab_file)
|
||||
|
||||
super().__init__(
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
@ -114,10 +118,6 @@ class ReformerTokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_file = vocab_file
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(vocab_file)
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return self.sp_model.get_piece_size()
|
||||
|
@ -111,6 +111,13 @@ class RemBertTokenizer(PreTrainedTokenizer):
|
||||
mask_token="[MASK]",
|
||||
**kwargs,
|
||||
):
|
||||
self.do_lower_case = do_lower_case
|
||||
self.remove_space = remove_space
|
||||
self.keep_accents = keep_accents
|
||||
self.vocab_file = vocab_file
|
||||
|
||||
self.sp_model = spm.SentencePieceProcessor()
|
||||
self.sp_model.Load(vocab_file)
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
remove_space=remove_space,
|
||||
@ -125,14 +132,6 @@ class RemBertTokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.do_lower_case = do_lower_case
|
||||
self.remove_space = remove_space
|
||||
self.keep_accents = keep_accents
|
||||
self.vocab_file = vocab_file
|
||||
|
||||
self.sp_model = spm.SentencePieceProcessor()
|
||||
self.sp_model.Load(vocab_file)
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return len(self.sp_model)
|
||||
|
@ -203,28 +203,21 @@ class RobertaTokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
):
|
||||
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
||||
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
||||
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
||||
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
|
||||
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
|
||||
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
||||
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
**kwargs,
|
||||
mask_token = (
|
||||
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
|
||||
if isinstance(mask_token, str)
|
||||
else mask_token
|
||||
)
|
||||
|
||||
# these special tokens are not part of the vocab.json, let's add them in the correct order
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
@ -241,12 +234,27 @@ class RobertaTokenizer(PreTrainedTokenizer):
|
||||
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
||||
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
||||
|
||||
super().__init__(
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return len(self.encoder)
|
||||
|
||||
def get_vocab(self):
|
||||
return dict(self.encoder, **self.added_tokens_encoder)
|
||||
vocab = dict(self.encoder).copy()
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
return vocab
|
||||
|
||||
def bpe(self, token):
|
||||
if token in self.cache:
|
||||
|
@ -177,6 +177,11 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
|
||||
trim_offsets=True,
|
||||
**kwargs,
|
||||
):
|
||||
mask_token = (
|
||||
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
|
||||
if isinstance(mask_token, str)
|
||||
else mask_token
|
||||
)
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
|
@ -156,20 +156,6 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
strip_accents=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
for cur_file in [vocab_file, word_shape_file, word_pronunciation_file]:
|
||||
if cur_file is None or not os.path.isfile(cur_file):
|
||||
raise ValueError(
|
||||
@ -195,7 +181,20 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = RoCBertWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||
self.wordpiece_tokenizer = RoCBertWordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def do_lower_case(self):
|
||||
|
@ -378,20 +378,6 @@ class RoFormerTokenizer(PreTrainedTokenizer):
|
||||
strip_accents=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
|
||||
@ -407,7 +393,7 @@ class RoFormerTokenizer(PreTrainedTokenizer):
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
|
||||
try:
|
||||
import rjieba
|
||||
except ImportError:
|
||||
@ -417,6 +403,20 @@ class RoFormerTokenizer(PreTrainedTokenizer):
|
||||
)
|
||||
self.jieba = rjieba
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
do_basic_tokenize=do_basic_tokenize,
|
||||
never_split=never_split,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def do_lower_case(self):
|
||||
return self.basic_tokenizer.do_lower_case
|
||||
|
@ -122,23 +122,12 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
|
||||
do_lower_case=False,
|
||||
tgt_lang=None,
|
||||
lang_codes=None,
|
||||
additional_special_tokens=None,
|
||||
sp_model_kwargs: Optional[Dict[str, Any]] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
do_upper_case=do_upper_case,
|
||||
do_lower_case=do_lower_case,
|
||||
tgt_lang=tgt_lang,
|
||||
lang_codes=lang_codes,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
self.do_upper_case = do_upper_case
|
||||
self.do_lower_case = do_lower_case
|
||||
|
||||
@ -152,18 +141,39 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
|
||||
self.langs = LANGUAGES[lang_codes]
|
||||
self.lang_tokens = [f"<lang:{lang}>" for lang in self.langs]
|
||||
self.lang_code_to_id = {lang: self.sp_model.PieceToId(f"<lang:{lang}>") for lang in self.langs}
|
||||
|
||||
self._additional_special_tokens = self.lang_tokens
|
||||
if additional_special_tokens is not None:
|
||||
additional_special_tokens = self.lang_tokens + additional_special_tokens
|
||||
else:
|
||||
additional_special_tokens = self.lang_tokens
|
||||
self._tgt_lang = tgt_lang if tgt_lang is not None else self.langs[0]
|
||||
|
||||
self.set_tgt_lang_special_tokens(self._tgt_lang)
|
||||
else:
|
||||
self.lang_code_to_id = {}
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
do_upper_case=do_upper_case,
|
||||
do_lower_case=do_lower_case,
|
||||
tgt_lang=tgt_lang,
|
||||
lang_codes=lang_codes,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
return len(self.encoder)
|
||||
|
||||
def get_vocab(self) -> Dict:
|
||||
vocab = self.encoder.copy()
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
return vocab
|
||||
|
||||
@property
|
||||
def tgt_lang(self) -> str:
|
||||
return self._tgt_lang
|
||||
@ -241,11 +251,6 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
|
||||
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
|
||||
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
|
||||
|
||||
def get_vocab(self) -> Dict:
|
||||
vocab = self.encoder.copy()
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
return vocab
|
||||
|
||||
def __getstate__(self) -> Dict:
|
||||
state = self.__dict__.copy()
|
||||
state["sp_model"] = None
|
||||
|
@ -110,15 +110,6 @@ class Speech2Text2Tokenizer(PreTrainedTokenizer):
|
||||
merges_file=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
pad_token=pad_token,
|
||||
do_lower_case=do_lower_case,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.do_lower_case = do_lower_case
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
@ -137,6 +128,14 @@ class Speech2Text2Tokenizer(PreTrainedTokenizer):
|
||||
merges = [tuple(merge.split()[:2]) for merge in merges]
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
super().__init__(
|
||||
unk_token=unk_token,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
pad_token=pad_token,
|
||||
do_lower_case=do_lower_case,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
|
@ -105,6 +105,12 @@ class SpeechT5Tokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
self.vocab_file = vocab_file
|
||||
self.normalize = normalize
|
||||
self._normalizer = None
|
||||
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(vocab_file)
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
@ -116,13 +122,6 @@ class SpeechT5Tokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_file = vocab_file
|
||||
self.normalize = normalize
|
||||
self._normalizer = None
|
||||
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(vocab_file)
|
||||
|
||||
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
|
||||
normalize = kwargs.pop("normalize", self.normalize)
|
||||
if is_split_into_words:
|
||||
|
@@ -137,20 +137,6 @@ class SplinterTokenizer(PreTrainedTokenizer):
strip_accents=None,
**kwargs,
):
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -166,8 +152,21 @@ class SplinterTokenizer(PreTrainedTokenizer):
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
self.question_token = question_token
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

@property
def question_token_id(self):
@@ -138,20 +138,6 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
strip_accents=None,
**kwargs,
):
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -167,7 +153,22 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)

@property
def do_lower_case(self):
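In the Splinter and SqueezeBert hunks, `WordpieceTokenizer` is now built with `unk_token=str(unk_token)` instead of `self.unk_token`: with the reordered `__init__`, `super().__init__` has not run yet at that point, so `self.unk_token` is not available, and the constructor argument (which may be an `AddedToken` rather than a plain string) is cast to its text. A hedged sketch of that cast; verify the exact behaviour against your installed `transformers`/`tokenizers` versions:

```python
# Hedged sketch: str() on an AddedToken is expected to yield its content,
# so str(unk_token) works whether the caller passed a plain string or an
# AddedToken object.
from transformers import AddedToken

unk_token = AddedToken("[UNK]", lstrip=False, rstrip=False)
print(str(unk_token))  # "[UNK]"
print(str("[UNK]"))    # "[UNK]" -- plain strings pass through unchanged
```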
@@ -25,6 +25,7 @@ import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken

if TYPE_CHECKING:
@@ -152,18 +153,37 @@ class T5Tokenizer(PreTrainedTokenizer):
legacy=None,
**kwargs,
) -> None:
# Add extra_ids to the special token list
if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and additional_special_tokens is not None:
# Check that we have the right number of extra_id special tokens
extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
if extra_tokens != extra_ids:
pad_token = AddedToken(pad_token, rstrip=True, lstrip=True)
unk_token = AddedToken(unk_token, rstrip=True, lstrip=True)
eos_token = AddedToken(eos_token, rstrip=True, lstrip=True)

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

self.vocab_file = vocab_file
self._extra_ids = extra_ids

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)

if additional_special_tokens is not None:
extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
if extra_ids > 0 and extra_ids != len(extra_tokens):
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
" provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
" tokens"
)
else:
extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
additional_special_tokens = extra_tokens

# for legacy purpose, we keep this. Will be removed and tests updated. (when `added_tokens_decoder` is not passed as kwargs)
self._added_tokens_decoder = {}
for i in range(len(extra_tokens)):
self._added_tokens_decoder[len(self.sp_model) - 1 + extra_ids - i] = AddedToken(
f"<extra_id_{i}>", single_word=True, lstrip=True, rstrip=True, special=True
)

if legacy is None:
logger.warning_once(
f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
@@ -175,7 +195,9 @@ class T5Tokenizer(PreTrainedTokenizer):
legacy = True

self.legacy = legacy
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.sp_model = self.get_spm_processor()
self.vocab_file = vocab_file
self._extra_ids = extra_ids

super().__init__(
eos_token=eos_token,
@@ -188,11 +210,6 @@ class T5Tokenizer(PreTrainedTokenizer):
**kwargs,
)

self.vocab_file = vocab_file
self._extra_ids = extra_ids

self.sp_model = self.get_spm_processor()

def get_spm_processor(self):
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
if self.legacy: # no dependency on protobuf
@@ -234,7 +251,7 @@ class T5Tokenizer(PreTrainedTokenizer):

@property
def vocab_size(self):
return self.sp_model.get_piece_size() + self._extra_ids
return self.sp_model.get_piece_size()

def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
@@ -275,7 +292,7 @@ class T5Tokenizer(PreTrainedTokenizer):
)

def get_sentinel_token_ids(self):
return [self._convert_token_to_id(token) for token in self.get_sentinel_tokens()]
return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]

def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
"""Do not add eos again if user already added it."""
@@ -391,18 +408,11 @@ class T5Tokenizer(PreTrainedTokenizer):

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token.startswith("<extra_id_"):
match = re.match(r"<extra_id_(\d+)>", token)
num = int(match.group(1))
return self.vocab_size - num - 1
return self.sp_model.piece_to_id(token)

def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index < self.sp_model.get_piece_size():
token = self.sp_model.IdToPiece(index)
else:
token = f"<extra_id_{self.vocab_size - 1 - index}>"
token = self.sp_model.IdToPiece(index)
return token

def convert_tokens_to_string(self, tokens):
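The T5 hunks change how the `<extra_id_*>` sentinels are resolved: `vocab_size` now reports only the SentencePiece size, the sentinels are registered up front through `_added_tokens_decoder`, and lookups go through `convert_tokens_to_ids` instead of the old `vocab_size - num - 1` arithmetic. A hedged usage sketch (it assumes network access to download the `t5-small` checkpoint, and the printed ids depend on that checkpoint):

```python
# Hedged sketch: sentinel ids now come from the added-tokens machinery
# rather than from offsets computed against vocab_size.
from transformers import T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-small")
sentinel_id = tok.convert_tokens_to_ids("<extra_id_0>")
print(sentinel_id, tok.convert_ids_to_tokens(sentinel_id))  # e.g. 32099 <extra_id_0>
print(tok.get_sentinel_token_ids()[:3])
```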
@@ -31,6 +31,7 @@ import numpy as np
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
VERY_LARGE_INTEGER,
BatchEncoding,
EncodedInput,
PreTokenizedInput,
@@ -351,6 +352,44 @@ class TapasTokenizer(PreTrainedTokenizer):
else:
additional_special_tokens = [empty_token]

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

# Additional properties
self.cell_trim_length = cell_trim_length
self.max_column_id = (
max_column_id
if max_column_id is not None
else model_max_length
if model_max_length is not None
else VERY_LARGE_INTEGER
)
self.max_row_id = (
max_row_id
if max_row_id is not None
else model_max_length
if model_max_length is not None
else VERY_LARGE_INTEGER
)
self.strip_column_names = strip_column_names
self.update_answer_coordinates = update_answer_coordinates
self.min_question_length = min_question_length
self.max_question_length = max_question_length

super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
@@ -375,32 +414,6 @@ class TapasTokenizer(PreTrainedTokenizer):
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

# Additional properties
self.cell_trim_length = cell_trim_length
self.max_column_id = max_column_id if max_column_id is not None else self.model_max_length
self.max_row_id = max_row_id if max_row_id is not None else self.model_max_length
self.strip_column_names = strip_column_names
self.update_answer_coordinates = update_answer_coordinates
self.min_question_length = min_question_length
self.max_question_length = max_question_length

@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@@ -181,25 +181,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
language="en",
**kwargs,
):
super().__init__(
special=special,
min_freq=min_freq,
max_size=max_size,
lower_case=lower_case,
delimiter=delimiter,
vocab_file=vocab_file,
pretrained_vocab_file=pretrained_vocab_file,
never_split=never_split,
unk_token=unk_token,
eos_token=eos_token,
additional_special_tokens=additional_special_tokens,
language=language,
**kwargs,
)
requires_backends(self, "sacremoses")

if never_split is None:
never_split = self.all_special_tokens
if special is None:
special = []
self.counter = Counter()
@@ -209,7 +191,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
self.lower_case = lower_case
self.delimiter = delimiter
self.vocab_file = vocab_file
self.never_split = never_split
self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
self.punction_without_space_before_pattern = re.compile(rf"[^\s][{self.punctuation_symbols}]")
self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()
@@ -217,7 +198,8 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
self.moses_tokenizer = sm.MosesTokenizer(language)
self.moses_detokenizer = sm.MosesDetokenizer(language)

self.idx2sym = []
self.sym2idx = OrderedDict()
# This try... catch... is not beautiful but honestly this tokenizer was not made to be used
# in a library like ours, at all.
try:
@@ -241,7 +223,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):

if vocab_dict is not None:
for key, value in vocab_dict.items():
if key not in self.__dict__:
if key not in self.__dict__ or key == "sym2idx":
self.__dict__[key] = value
elif vocab_file is not None:
self.build_vocab()
@@ -256,6 +238,27 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
if vocab_file is not None:
self.build_vocab()

super().__init__(
special=special,
min_freq=min_freq,
max_size=max_size,
lower_case=lower_case,
delimiter=delimiter,
vocab_file=vocab_file,
pretrained_vocab_file=pretrained_vocab_file,
never_split=never_split,
unk_token=unk_token,
eos_token=eos_token,
additional_special_tokens=additional_special_tokens,
language=language,
**kwargs,
)

# these are not required to initialize the parent class as only used when tokenizing.
if never_split is None:
never_split = self.all_special_tokens
self.never_split = never_split

@property
def do_lower_case(self):
return self.lower_case
@@ -305,7 +308,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
elif "<unk>" in self.sym2idx:
self.unk_idx = self.sym2idx["<unk>"]
else:
raise ValueError("No <unknown> token in vocabulary")
raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement.")

def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if os.path.isdir(save_directory):
@@ -323,7 +326,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
if self.vocab_file:
logger.info(f"building vocab from {self.vocab_file}")
self._build_from_file(self.vocab_file)
logger.info(f"final vocab size {len(self)}")
logger.info(f"Final vocab size {len(self.sym2idx)}")
else:
logger.info(f"building vocab with min_freq={self.min_freq}, max_size={self.max_size}")
self.idx2sym = []
@@ -337,7 +340,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
break
self.add_symbol(sym)

logger.info(f"final vocab size {len(self)} from {len(self.counter)} unique tokens")
logger.info(f"Final vocab size {len(self.sym2idx)} from {len(self.counter)} unique tokens")

@torch_only_method
def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
@@ -406,9 +409,8 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
self.sym2idx[current_sym] = idx

# Delete token from added_tokens
old_index = self.added_tokens_encoder[token]
del self.added_tokens_decoder[old_index]
del self.added_tokens_encoder[token]
old_index = self._added_tokens_encoder.pop(token)
self._added_tokens_decoder.pop(old_index)

def moses_punct_norm(self, text):
return self.moses_punct_normalizer.normalize(text)
@@ -463,7 +465,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
elif "<UNK>" in self.sym2idx:
return self.sym2idx["<UNK>"]
else:
raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement")
raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement.")

def convert_tokens_to_string(self, tokens):
"""
@@ -482,7 +484,9 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
return len(self.idx2sym)

def get_vocab(self):
return dict(self.sym2idx, **self.added_tokens_encoder)
vocab = self.sym2idx.copy()
vocab.update(self.added_tokens_encoder)
return vocab

def _tokenize(self, line, add_eos=False, add_double_eos=False):
line = line.strip()
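The TransfoXL hunks stop mutating `added_tokens_encoder` / `added_tokens_decoder` directly (using the new private `_added_tokens_encoder` / `_added_tokens_decoder` stores instead) and rebuild `get_vocab` as an explicit copy-and-update. A toy illustration of that copy-and-update pattern, with placeholder dicts rather than the real attributes:

```python
# Toy illustration: copy the base vocab, then layer the added tokens on top
# so that added tokens win on any key collision.
sym2idx = {"hello": 0, "world": 1}
added_tokens_encoder = {"<new_tok>": 2}

vocab = sym2idx.copy()
vocab.update(added_tokens_encoder)
print(vocab)  # {'hello': 0, 'world': 1, '<new_tok>': 2}
```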
@@ -93,17 +93,6 @@ class VitsTokenizer(PreTrainedTokenizer):
is_uroman=False,
**kwargs,
) -> None:
super().__init__(
pad_token=pad_token,
unk_token=unk_token,
language=language,
add_blank=add_blank,
normalize=normalize,
phonemize=phonemize,
is_uroman=is_uroman,
**kwargs,
)

with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)

@@ -115,12 +104,24 @@ class VitsTokenizer(PreTrainedTokenizer):

self.is_uroman = is_uroman

super().__init__(
pad_token=pad_token,
unk_token=unk_token,
language=language,
add_blank=add_blank,
normalize=normalize,
phonemize=phonemize,
is_uroman=is_uroman,
**kwargs,
)

@property
def vocab_size(self):
return len(self.encoder)

def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab

def normalize_text(self, input_string):
@@ -24,7 +24,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

import numpy as np

from ...tokenization_utils import PreTrainedTokenizer, _insert_one_token_to_ordered_list
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken, BatchEncoding
from ...utils import (
ModelOutput,
@@ -174,18 +174,6 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
target_lang=None,
**kwargs,
):
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
do_lower_case=do_lower_case,
word_delimiter_token=word_delimiter_token,
replace_word_delimiter_char=replace_word_delimiter_char,
target_lang=target_lang,
**kwargs,
)

self._word_delimiter_token = word_delimiter_token

self.do_lower_case = do_lower_case
@@ -204,13 +192,28 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):

self.decoder = {v: k for k, v in self.encoder.items()}

super().__init__(
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
do_lower_case=do_lower_case,
word_delimiter_token=word_delimiter_token,
replace_word_delimiter_char=replace_word_delimiter_char,
target_lang=target_lang,
**kwargs,
)

# make sure that tokens made of several
# characters are not split at tokenization

# TODO @ArthurZ add them or just update the trie?
unique_no_split_tokens = []
for token in self.encoder.keys():
if len(token) > 1:
self.unique_no_split_tokens.append(token)
unique_no_split_tokens.append(AddedToken(token, rstrip=True, lstrip=True, normalized=False))

self._create_trie(self.unique_no_split_tokens)
self.add_tokens(unique_no_split_tokens)

def set_target_lang(self, target_lang: str):
"""
@@ -266,7 +269,20 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
return len(self.decoder)

def get_vocab(self) -> Dict:
return dict(self.vocab, **self.added_tokens_encoder)
vocab = dict(self.encoder)
vocab.update(self.added_tokens_encoder)
return vocab

def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
# Overwritten to never strip!
to_add = []
for token in new_tokens:
if isinstance(token, str):
to_add.append(AddedToken(token, rstrip=False, lstrip=False, normalize=False))
else:
to_add.append(token)

return super()._add_tokens(to_add, special_tokens)

def _tokenize(self, text, **kwargs):
"""
@@ -645,64 +661,6 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):

return (vocab_file,)

def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
it with indices starting from length of the current vocabulary.

Args:
new_tokens (`List[str]`or `List[tokenizers.AddedToken]`):
Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
checking if the tokenizer assign the index of the `unk_token` to them).
special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the tokens should be added as special tokens.

Returns:
`int`: The number of tokens actually added to the vocabulary.

Example:

```python
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
print("We have added", num_added_toks, "tokens")
# Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
```"""
new_tokens = [str(tok) for tok in new_tokens]

tokens_to_add = []
for token in new_tokens:
assert isinstance(token, str)
if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
token = token.lower()
if (
token != self.unk_token
and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
and token not in tokens_to_add
):
tokens_to_add.append(token)
if self.verbose:
logger.info(f"Adding {token} to the vocabulary")

added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
self.added_tokens_encoder.update(added_tok_encoder)
self.added_tokens_decoder.update(added_tok_decoder)

# Make sure we don't split on any special tokens (even they were already in the vocab before)
for token in tokens_to_add:
if len(token) > 1:
self._additional_special_tokens.append(AddedToken(token))
_insert_one_token_to_ordered_list(self.unique_no_split_tokens, token)

self._create_trie(self.unique_no_split_tokens)

return len(tokens_to_add)


class Wav2Vec2Tokenizer(PreTrainedTokenizer):
"""
@@ -777,18 +735,6 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
return_attention_mask=False,
**kwargs,
):
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
do_lower_case=do_lower_case,
do_normalize=do_normalize,
return_attention_mask=return_attention_mask,
word_delimiter_token=word_delimiter_token,
**kwargs,
)

warnings.warn(
"The class `Wav2Vec2Tokenizer` is deprecated and will be removed in version 5 of Transformers. Please use"
" `Wav2Vec2Processor` or `Wav2Vec2CTCTokenizer` instead.",
@@ -806,6 +752,18 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):

self.decoder = {v: k for k, v in self.encoder.items()}

super().__init__(
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
do_lower_case=do_lower_case,
do_normalize=do_normalize,
return_attention_mask=return_attention_mask,
word_delimiter_token=word_delimiter_token,
**kwargs,
)

@property
def word_delimiter_token(self) -> str:
"""
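The Wav2Vec2 hunks replace the long hand-rolled `_add_tokens` with a short override that wraps plain strings in `AddedToken` with stripping disabled and defers to the base class; multi-character vocab entries are likewise registered through `self.add_tokens(...)` rather than the removed `unique_no_split_tokens` / `_create_trie` bookkeeping. A hedged sketch of the wrapping step (the helper name is made up for illustration):

```python
# Hedged sketch: plain strings become AddedToken objects with lstrip/rstrip
# disabled so surrounding whitespace is preserved; AddedToken inputs are
# passed through untouched.
from transformers import AddedToken


def wrap_without_stripping(new_tokens):  # hypothetical helper name
    return [
        AddedToken(tok, rstrip=False, lstrip=False) if isinstance(tok, str) else tok
        for tok in new_tokens
    ]


print(wrap_without_stripping(["<my_multichar_token>", AddedToken("|", lstrip=True)]))
```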
@@ -23,7 +23,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

import numpy as np

from ...tokenization_utils import PreTrainedTokenizer, _insert_one_token_to_ordered_list
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken
from ...utils import (
ModelOutput,
@@ -143,6 +143,18 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
phonemizer_backend="espeak",
**kwargs,
):
self._word_delimiter_token = word_delimiter_token
self._phone_delimiter_token = phone_delimiter_token
self.do_phonemize = do_phonemize
self.phonemizer_lang = phonemizer_lang
self.phonemizer_backend = phonemizer_backend

if do_phonemize:
self.init_backend(self.phonemizer_lang)

with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
@@ -156,25 +168,25 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
**kwargs,
)

self._word_delimiter_token = word_delimiter_token
self._phone_delimiter_token = phone_delimiter_token
self.do_phonemize = do_phonemize
self.phonemizer_lang = phonemizer_lang
self.phonemizer_backend = phonemizer_backend

if do_phonemize:
self.init_backend(self.phonemizer_lang)

with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}

@property
def vocab_size(self) -> int:
return len(self.decoder)

def get_vocab(self) -> Dict:
return dict(self.encoder, **self.added_tokens_encoder)
vocab = dict(self.encoder)
vocab.update(self.added_tokens_encoder)
return vocab

def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
# Overwritten to never strip!
to_add = []
for token in new_tokens:
if isinstance(token, str):
to_add.append(AddedToken(token, rstrip=False, lstrip=False, normalize=True))
else:
to_add.append(token)

return super()._add_tokens(to_add, special_tokens)

def init_backend(self, phonemizer_lang: str):
"""
@@ -576,61 +588,3 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

return (vocab_file,)

def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
it with indices starting from length of the current vocabulary.

Args:
new_tokens (`List[str]`or `List[tokenizers.AddedToken]`):
Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
checking if the tokenizer assign the index of the `unk_token` to them).
special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the tokens should be added as special tokens.

Returns:
`int`: The number of tokens actually added to the vocabulary.

Examples:

```python
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model = Wav2Vec2PhonemeForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")

num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
print("We have added", num_added_toks, "tokens")
# Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
```"""
new_tokens = [str(tok) for tok in new_tokens]

tokens_to_add = []
for token in new_tokens:
if not isinstance(token, str):
raise ValueError(f"Token {token} has to be of type string, but is of type {type(token)}.")
assert isinstance(token, str)
if (
token != self.unk_token
and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
and token not in tokens_to_add
):
tokens_to_add.append(token)
if self.verbose:
logger.info(f"Adding {token} to the vocabulary")

added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
self.added_tokens_encoder.update(added_tok_encoder)
self.added_tokens_decoder.update(added_tok_decoder)

# Make sure we don't split on any special tokens (even they were already in the vocab before)
for token in tokens_to_add:
if len(token) > 1:
self._additional_special_tokens.append(AddedToken(token))
_insert_one_token_to_ordered_list(self.unique_no_split_tokens, token)

self._create_trie(self.unique_no_split_tokens)

return len(tokens_to_add)
@@ -272,18 +272,25 @@ class WhisperTokenizer(PreTrainedTokenizer):
predict_timestamps=False,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
super().__init__(
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
add_prefix_space=add_prefix_space,
**kwargs,
bos_token = (
AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True)
if isinstance(bos_token, str)
else bos_token
)
eos_token = (
AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True)
if isinstance(eos_token, str)
else eos_token
)
unk_token = (
AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True)
if isinstance(unk_token, str)
else unk_token
)
pad_token = (
AddedToken(pad_token, lstrip=False, rstrip=False, normalized=False, special=True)
if isinstance(pad_token, str)
else pad_token
)

with open(vocab_file, encoding="utf-8") as vocab_handle:
@@ -309,18 +316,28 @@ class WhisperTokenizer(PreTrainedTokenizer):
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

self.language = language
super().__init__(
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
add_prefix_space=add_prefix_space,
**kwargs,
)

self.task = task
self.predict_timestamps = predict_timestamps

@property
def vocab_size(self) -> int:
return len(self.encoder)

def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab

@property
def vocab_size(self) -> int:
return len(self.encoder)

# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe with GPT2 -> Whisper
def bpe(self, token):
if token in self.cache:
@@ -390,11 +407,10 @@ class WhisperTokenizer(PreTrainedTokenizer):

@property
def prefix_tokens(self) -> List[int]:
all_special_ids = self.all_special_ids
bos_token_id = all_special_ids[-106]
translate_token_id = all_special_ids[-6]
transcribe_token_id = all_special_ids[-5]
notimestamps_token_id = all_special_ids[-1]
bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
translate_token_id = self.convert_tokens_to_ids("<|translate|>")
transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
langs = tuple(LANGUAGES.keys())

if self.language is not None:
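The Whisper `prefix_tokens` hunk stops indexing `all_special_ids` at hard-coded negative offsets and instead resolves each control token by name with `convert_tokens_to_ids`, which stays correct even when extra special tokens are added later. A hedged usage sketch (it assumes network access to download the `openai/whisper-tiny` checkpoint; the printed ids are checkpoint-dependent):

```python
# Hedged sketch: resolving Whisper control tokens by string is robust to the
# number and order of special tokens, unlike fixed negative offsets.
from transformers import WhisperTokenizer

tok = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
for name in ("<|startoftranscript|>", "<|translate|>", "<|transcribe|>", "<|notimestamps|>"):
    print(name, tok.convert_tokens_to_ids(name))
print(tok.prefix_tokens)
```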