Enable option for subword regularization in XLMRobertaTokenizer (#11149)
* enable subword regularization.
* fix tokenizer storage
* fix docstring formatting
* Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
  Co-authored-by: Stefan Schweter <stefan@schweter.it>
* fix docstring formatting
* add test for subword regularization tokenizer
* improve comments of test
* add sp_model_kwargs
* reformat docstring to match the style
* add some more documentation
* Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* improve docstring
* empty commit to trigger CI
* Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* fix docstring formatting for sphinx

Co-authored-by: Stefan Schweter <stefan@schweter.it>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
parent 1ef152eb48
commit 195bfd118a
@@ -94,6 +94,20 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
             modeling. This is the token which the model will try to predict.
         additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
+        sp_model_kwargs (:obj:`dict`, `optional`, defaults to :obj:`None`):
+            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
+            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+
+            - ``enable_sampling``: Enable subword regularization.
+            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - ``nbest_size = {0,1}``: No sampling is performed.
+              - ``nbest_size > 1``: samples from the nbest_size results.
+              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.

     Attributes:
         sp_model (:obj:`SentencePieceProcessor`):
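A minimal usage sketch of the new argument, for readers skimming the diff (the vocab path below is a placeholder, not part of this commit):

from transformers import XLMRobertaTokenizer

# Placeholder path to a trained SentencePiece model file; substitute your own.
vocab_file = "sentencepiece.bpe.model"

# Enable subword regularization: sample over the whole lattice (nbest_size=-1)
# with smoothing parameter alpha=0.1, matching the options documented above.
tokenizer = XLMRobertaTokenizer(
    vocab_file,
    sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
)

# With sampling enabled, the same sentence may be split differently on each call.
print(tokenizer.tokenize("This is a test for subword regularization."))
print(tokenizer.tokenize("This is a test for subword regularization."))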
@@ -115,11 +129,14 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         unk_token="<unk>",
         pad_token="<pad>",
         mask_token="<mask>",
+        sp_model_kwargs=None,
         **kwargs
     ):
         # Mask token behave like a normal word, i.e. include the space before it
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

+        sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -128,10 +145,11 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
             cls_token=cls_token,
             pad_token=pad_token,
             mask_token=mask_token,
+            sp_model_kwargs=sp_model_kwargs,
             **kwargs,
         )

-        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model = spm.SentencePieceProcessor(**sp_model_kwargs)
         self.sp_model.Load(str(vocab_file))
         self.vocab_file = vocab_file

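To make the SentencePieceProcessor(**sp_model_kwargs) change concrete, a sketch at the sentencepiece level (assuming a sentencepiece release whose constructor accepts these options, roughly >= 0.1.91, and a placeholder model path):

import sentencepiece as spm

# Sampling options passed to the constructor become the defaults
# used by subsequent encode() calls.
sp = spm.SentencePieceProcessor(enable_sampling=True, nbest_size=-1, alpha=0.1)
sp.Load("sentencepiece.bpe.model")  # placeholder model path

# Repeated calls can now return different segmentations of the same text.
print(sp.encode("subword regularization", out_type=str))
print(sp.encode("subword regularization", out_type=str))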
@@ -249,7 +267,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         return vocab

     def _tokenize(self, text):
-        return self.sp_model.EncodeAsPieces(text)
+        return self.sp_model.encode(text, out_type=str)

     def _convert_token_to_id(self, token):
         """ Converts a token (str) in an id using the vocab. """
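Note on the replaced call: encode(text, out_type=str) returns the subword pieces as strings and picks up the sampling options configured on self.sp_model, which is what lets tokenize() produce sampled segmentations when enable_sampling is set. A small illustration, assuming the tokenizer instance from the earlier sketch:

# out_type controls the return type of SentencePiece's encode():
ids = tokenizer.sp_model.encode("Hello world")                   # list of token ids (default out_type=int)
pieces = tokenizer.sp_model.encode("Hello world", out_type=str)  # list of subword piece strings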
@@ -14,6 +14,7 @@
 # limitations under the License.


+import itertools
 import os
 import unittest

@@ -118,6 +119,29 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             ],
         )

+    def test_subword_regularization_tokenizer(self):
+        # Subword regularization is only available for the slow tokenizer.
+        tokenizer = XLMRobertaTokenizer(
+            SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
+        )
+
+        # Subword regularization augments training data with subword sampling.
+        # This has a random component. We test if the tokenizer generates different
+        # results when subword regularization is enabled.
+        tokens_list = []
+        for _ in range(5):
+            tokens_list.append(tokenizer.tokenize("This is a test for subword regularization."))
+
+        # the list of different pairs of tokens_list
+        combinations = itertools.combinations(tokens_list, 2)
+
+        all_equal = True
+        for combination in combinations:
+            if combination[0] != combination[1]:
+                all_equal = False
+
+        self.assertFalse(all_equal)
+
     @cached_property
     def big_tokenizer(self):
         return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
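A possible companion check, sketched here but not part of this commit: with no sp_model_kwargs passed, sampling stays disabled and repeated tokenization of the same text should be identical.

    def test_no_subword_regularization_tokenizer(self):
        # Hypothetical companion test: without sp_model_kwargs, sampling is off,
        # so every call tokenizes the sentence the same way.
        tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens_list = [tokenizer.tokenize("This is a test for subword regularization.") for _ in range(5)]
        for tokens in tokens_list:
            self.assertEqual(tokens, tokens_list[0])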