add a warning in SpmConverter for sentencepiece models that use the byte fallback feature (#16629)

* update proto sentencepiece model

* Revert "update proto sentencepiece model"

This reverts commit b07f671747.

* add check

* add test

* Revert "Revert "update proto sentencepiece model""

This reverts commit 46108257b8.

* test for log level

* test for log level 2

* warning at the warning level

* clean

* format

* add explanation in docstring
SaulLu authored on 2022-04-11 11:06:10 +02:00, committed by GitHub
commit 1025a9b742 (parent 7c5d79912a)
4 changed files with 486 additions and 158 deletions

src/transformers/convert_slow_tokenizer.py

@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
 allow to make our dependency on SentencePiece optional.
 """
 
+import warnings
 from typing import Dict, List, Tuple
 
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@@ -429,6 +430,14 @@ class SpmConverter(Converter):
             m.ParseFromString(f.read())
         self.proto = m
 
+        if self.proto.trainer_spec.byte_fallback:
+            warnings.warn(
+                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
+                " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
+                " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
+                "unknown tokens into a sequence of byte tokens matching the original piece of text."
+            )
+
     def vocab(self, proto):
         return [(piece.piece, piece.score) for piece in proto.pieces]
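Side note (not part of this diff): the byte_fallback flag checked above lives in sentencepiece's trainer_spec proto. A minimal sketch of inspecting it directly with the sentencepiece package, assuming a local model file named spiece.model:

# Sketch only: read a sentencepiece model and print the flag that SpmConverter now checks.
# The file path is an assumption; any serialized sentencepiece model works.
from sentencepiece import sentencepiece_model_pb2 as model_pb2

m = model_pb2.ModelProto()
with open("spiece.model", "rb") as f:  # hypothetical sentencepiece model file
    m.ParseFromString(f.read())

# True means the slow tokenizer falls back to byte pieces for out-of-vocabulary text,
# a behavior the converted fast tokenizer does not reproduce.
print(m.trainer_spec.byte_fallback)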

File diff suppressed because it is too large.

tests/fixtures/test_sentencepiece_with_bytefallback.model (binary file, not shown)

tests/test_convert_slow_tokenizer.py

@@ -0,0 +1,36 @@
+import unittest
+import warnings
+from dataclasses import dataclass
+
+from transformers.convert_slow_tokenizer import SpmConverter
+from transformers.testing_utils import get_tests_dir
+
+
+@dataclass
+class FakeOriginalTokenizer:
+    vocab_file: str
+
+
+class ConvertSlowTokenizerTest(unittest.TestCase):
+    def test_spm_converter_bytefallback_warning(self):
+        spm_model_file_without_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece.model"
+        spm_model_file_with_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece_with_bytefallback.model"
+
+        original_tokenizer_without_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_without_bytefallback)
+        with warnings.catch_warnings(record=True) as w:
+            _ = SpmConverter(original_tokenizer_without_bytefallback)
+        self.assertEqual(len(w), 0)
+
+        original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
+        with warnings.catch_warnings(record=True) as w:
+            _ = SpmConverter(original_tokenizer_with_bytefallback)
+        self.assertEqual(len(w), 1)
+
+        self.assertIn(
+            (
+                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
+                " which is not implemented in the fast tokenizers."
+            ),
+            str(w[0].message),
+        )
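For illustration (not from this PR), the divergence the new warning describes can be observed by comparing the slow and fast versions of a tokenizer whose sentencepiece model was trained with byte_fallback=True. The checkpoint name below is a hypothetical placeholder:

# Sketch: compare slow vs. fast tokenization of text containing an out-of-vocabulary character.
# "some-org/byte-fallback-model" is an assumed checkpoint, not a real one.
from transformers import AutoTokenizer

slow = AutoTokenizer.from_pretrained("some-org/byte-fallback-model", use_fast=False)
fast = AutoTokenizer.from_pretrained("some-org/byte-fallback-model", use_fast=True)

text = "snowman: ☃"  # assume "☃" is not in the vocabulary
print(slow.tokenize(text))  # the slow tokenizer may emit byte pieces such as <0xE2>, <0x98>, <0x83>
print(fast.tokenize(text))  # the converted fast tokenizer may emit <unk> instead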