transformers/tests/utils/test_convert_slow_tokenizer.py
SaulLu 1025a9b742
add a warning in SpmConverter for sentencepiece's model using the byte fallback feature (#16629)
* update proto sentencepiece model

* Revert "update proto sentencepiece model"

This reverts commit b07f671747.

* add check

* add test

* Revert "Revert "update proto sentencepiece model""

This reverts commit 46108257b8.

* test for log level

* test for log level 2

* warning at the warning level

* clean

* format

* add explanation in docstring
2022-04-11 11:06:10 +02:00

37 lines
1.4 KiB
Python

import unittest
import warnings
from dataclasses import dataclass
from transformers.convert_slow_tokenizer import SpmConverter
from transformers.testing_utils import get_tests_dir
@dataclass
class FakeOriginalTokenizer:
    """Lightweight stand-in for a slow tokenizer object.

    It carries nothing but the `vocab_file` path, which is all this module's
    test supplies when it hands an instance to `SpmConverter`.
    """

    # Path to the sentencepiece model file the converter is pointed at.
    vocab_file: str
class ConvertSlowTokenizerTest(unittest.TestCase):
    """Tests for the warnings raised while constructing an `SpmConverter`."""

    @staticmethod
    def _warnings_from_conversion(vocab_file):
        """Construct an `SpmConverter` for `vocab_file` and return the warnings recorded meanwhile."""
        slow_tokenizer = FakeOriginalTokenizer(vocab_file=vocab_file)
        with warnings.catch_warnings(record=True) as recorded:
            # Only the construction matters here; the converter itself is discarded.
            SpmConverter(slow_tokenizer)
        return recorded

    def test_spm_converter_bytefallback_warning(self):
        """The byte-fallback fixture must raise exactly one warning; the plain fixture none."""
        plain_model = f"{get_tests_dir()}/fixtures/test_sentencepiece.model"
        fallback_model = f"{get_tests_dir()}/fixtures/test_sentencepiece_with_bytefallback.model"

        expected_message = (
            "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
            " which is not implemented in the fast tokenizers."
        )

        # Fixture without byte fallback: the conversion must stay silent.
        self.assertEqual(len(self._warnings_from_conversion(plain_model)), 0)

        # Fixture with byte fallback: exactly one warning carrying the expected text.
        fallback_warnings = self._warnings_from_conversion(fallback_model)
        self.assertEqual(len(fallback_warnings), 1)
        self.assertIn(expected_message, str(fallback_warnings[0].message))