mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-31 02:02:21 +06:00
fix camembert and XLM-R tokenizer
This commit is contained in:
parent
ceae85ad60
commit
e37ca8e11a
@ -22,6 +22,7 @@ from shutil import copyfile
|
||||
|
||||
import sentencepiece as spm
|
||||
from transformers.tokenization_utils import PreTrainedTokenizer
|
||||
from .tokenization_xlnet import SPIECE_UNDERLINE
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -145,6 +146,11 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
||||
return self.fairseq_ids_to_tokens[index]
|
||||
return self.sp_model.IdToPiece(index - self.fairseq_offset)
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
    """Join sub-word tokens back into one plain string.

    SentencePiece marks word boundaries with SPIECE_UNDERLINE; after
    concatenating the pieces, those markers are turned back into spaces
    and surrounding whitespace is trimmed.
    """
    joined = ''.join(tokens)
    return joined.replace(SPIECE_UNDERLINE, ' ').strip()
|
||||
|
||||
def save_vocabulary(self, save_directory):
|
||||
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
||||
to a directory.
|
||||
|
@ -22,6 +22,7 @@ from shutil import copyfile
|
||||
|
||||
import sentencepiece as spm
|
||||
from transformers.tokenization_utils import PreTrainedTokenizer
|
||||
from .tokenization_xlnet import SPIECE_UNDERLINE
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -161,6 +162,11 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
||||
return self.fairseq_ids_to_tokens[index]
|
||||
return self.sp_model.IdToPiece(index - self.fairseq_offset)
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
    """Concatenate sub-word tokens into a single decoded string."""
    # SentencePiece prefixes word-initial pieces with SPIECE_UNDERLINE;
    # map those markers back to plain spaces and strip the edges.
    text = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ')
    return text.strip()
|
||||
|
||||
def save_vocabulary(self, save_directory):
|
||||
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
||||
to a directory.
|
||||
|
Loading…
Reference in New Issue
Block a user