add a warning in SpmConverter for sentencepiece models that use the byte fallback feature (#16629)

* update proto sentencepiece model

* Revert "update proto sentencepiece model"

This reverts commit b07f671747.

* add check

* add test

* Revert "Revert "update proto sentencepiece model""

This reverts commit 46108257b8.

* test for log level

* test for log level 2

* warning at the warning level

* clean

* format

* add explanation in docstring
SaulLu authored on 2022-04-11 11:06:10 +02:00, committed by GitHub
commit 1025a9b742 (parent 7c5d79912a)
4 changed files with 486 additions and 158 deletions

src/transformers/convert_slow_tokenizer.py

@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
 allow to make our dependency on SentencePiece optional.
 """
 
+import warnings
 from typing import Dict, List, Tuple
 
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@@ -429,6 +430,14 @@ class SpmConverter(Converter):
             m.ParseFromString(f.read())
         self.proto = m
 
+        if self.proto.trainer_spec.byte_fallback:
+            warnings.warn(
+                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
+                " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
+                " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
+                "unknown tokens into a sequence of byte tokens matching the original piece of text."
+            )
+
     def vocab(self, proto):
         return [(piece.piece, piece.score) for piece in proto.pieces]
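Side note (not part of this diff): the byte_fallback flag checked above lives in sentencepiece's trainer_spec proto. A minimal sketch of inspecting it directly with the sentencepiece package, assuming a local model file named spiece.model:

# Sketch only: read a sentencepiece model and print the flag that SpmConverter now checks.
# The file path is an assumption; any serialized sentencepiece model works.
from sentencepiece import sentencepiece_model_pb2 as model_pb2

m = model_pb2.ModelProto()
with open("spiece.model", "rb") as f:  # hypothetical sentencepiece model file
    m.ParseFromString(f.read())

# True means the slow tokenizer falls back to byte pieces for out-of-vocabulary text,
# a behavior the converted fast tokenizer does not reproduce.
print(m.trainer_spec.byte_fallback)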

File diff suppressed because it is too large.

tests/fixtures/test_sentencepiece_with_bytefallback.model (binary file, not shown)

tests/test_convert_slow_tokenizer.py

@@ -0,0 +1,36 @@
+import unittest
+import warnings
+from dataclasses import dataclass
+
+from transformers.convert_slow_tokenizer import SpmConverter
+from transformers.testing_utils import get_tests_dir
+
+
+@dataclass
+class FakeOriginalTokenizer:
+    vocab_file: str
+
+
+class ConvertSlowTokenizerTest(unittest.TestCase):
+    def test_spm_converter_bytefallback_warning(self):
+        spm_model_file_without_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece.model"
+        spm_model_file_with_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece_with_bytefallback.model"
+
+        original_tokenizer_without_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_without_bytefallback)
+        with warnings.catch_warnings(record=True) as w:
+            _ = SpmConverter(original_tokenizer_without_bytefallback)
+        self.assertEqual(len(w), 0)
+
+        original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
+        with warnings.catch_warnings(record=True) as w:
+            _ = SpmConverter(original_tokenizer_with_bytefallback)
+        self.assertEqual(len(w), 1)
+
+        self.assertIn(
+            (
+                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
+                " which is not implemented in the fast tokenizers."
+            ),
+            str(w[0].message),
+        )
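For illustration (not from this PR), the divergence the new warning describes can be observed by comparing the slow and fast versions of a tokenizer whose sentencepiece model was trained with byte_fallback=True. The checkpoint name below is a hypothetical placeholder:

# Sketch: compare slow vs. fast tokenization of text containing an out-of-vocabulary character.
# "some-org/byte-fallback-model" is an assumed checkpoint, not a real one.
from transformers import AutoTokenizer

slow = AutoTokenizer.from_pretrained("some-org/byte-fallback-model", use_fast=False)
fast = AutoTokenizer.from_pretrained("some-org/byte-fallback-model", use_fast=True)

text = "snowman: ☃"  # assume "☃" is not in the vocabulary
print(slow.tokenize(text))  # the slow tokenizer may emit byte pieces such as <0xE2>, <0x98>, <0x83>
print(fast.tokenize(text))  # the converted fast tokenizer may emit <unk> instead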