Add a warning in SpmConverter for sentencepiece models that use the byte fallback feature (#16629)

* update proto sentencepiece model
* Revert "update proto sentencepiece model" (reverts commit b07f671747)
* add check
* add test
* Revert "Revert "update proto sentencepiece model"" (reverts commit 46108257b8)
* test for log level
* test for log level 2
* warning at the warning level
* clean
* format
* add explanation in docstring
commit 1025a9b742
parent 7c5d79912a
src/transformers/convert_slow_tokenizer.py

@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
 allow to make our dependency on SentencePiece optional.
 """
 
+import warnings
 from typing import Dict, List, Tuple
 
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@@ -429,6 +430,14 @@ class SpmConverter(Converter):
             m.ParseFromString(f.read())
         self.proto = m
 
+        if self.proto.trainer_spec.byte_fallback:
+            warnings.warn(
+                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
+                " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
+                " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
+                "unknown tokens into a sequence of byte tokens matching the original piece of text."
+            )
+
     def vocab(self, proto):
         return [(piece.piece, piece.score) for piece in proto.pieces]
 
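The behaviour the new warning describes can be illustrated directly with the sentencepiece library. The snippet below is a minimal sketch and is not part of this diff: it assumes the byte-fallback fixture added in this commit and an input character that is absent from its vocabulary (the emoji is only a placeholder example).

import sentencepiece as spm

# Load the fixture model that was trained with byte fallback enabled.
sp = spm.SentencePieceProcessor(
    model_file="tests/fixtures/test_sentencepiece_with_bytefallback.model"
)

# The slow (sentencepiece) tokenizer falls back to byte pieces such as
# "<0xF0>", "<0x9F>", ... for characters it has never seen.
print(sp.encode("an out-of-vocabulary character: 🦙", out_type=str))

# A fast tokenizer converted from the same model has no byte fallback, so the
# same character would be mapped to the unknown token instead.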
File diff suppressed because it is too large
BIN  tests/fixtures/test_sentencepiece_with_bytefallback.model (vendored, new file)

Binary file not shown.
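For reference, a fixture like this can be produced with the sentencepiece trainer. The sketch below is a hypothetical reconstruction: the corpus and the exact parameter values used for the checked-in file are not part of this commit, so "corpus.txt" and the numbers are placeholders.

import sentencepiece as spm

# Hypothetical training call that produces a model with byte fallback enabled.
spm.SentencePieceTrainer.train(
    input="corpus.txt",  # placeholder corpus
    model_prefix="test_sentencepiece_with_bytefallback",
    vocab_size=1000,  # placeholder size
    byte_fallback=True,  # the option the new SpmConverter check looks for
)

The resulting .model file stores this flag in its trainer_spec, which is what the converter now inspects via self.proto.trainer_spec.byte_fallback.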
36  tests/utils/test_convert_slow_tokenizer.py (new file)

@@ -0,0 +1,36 @@
+import unittest
+import warnings
+from dataclasses import dataclass
+
+from transformers.convert_slow_tokenizer import SpmConverter
+from transformers.testing_utils import get_tests_dir
+
+
+@dataclass
+class FakeOriginalTokenizer:
+    vocab_file: str
+
+
+class ConvertSlowTokenizerTest(unittest.TestCase):
+    def test_spm_converter_bytefallback_warning(self):
+        spm_model_file_without_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece.model"
+        spm_model_file_with_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece_with_bytefallback.model"
+
+        original_tokenizer_without_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_without_bytefallback)
+
+        with warnings.catch_warnings(record=True) as w:
+            _ = SpmConverter(original_tokenizer_without_bytefallback)
+        self.assertEqual(len(w), 0)
+
+        original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
+
+        with warnings.catch_warnings(record=True) as w:
+            _ = SpmConverter(original_tokenizer_with_bytefallback)
+        self.assertEqual(len(w), 1)
+        self.assertIn(
+            (
+                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
+                " which is not implemented in the fast tokenizers."
+            ),
+            str(w[0].message),
+        )
|
Loading…
Reference in New Issue
Block a user