Add sudachi_projection option to BertJapaneseTokenizer (#28503)

* add sudachi_projection option

* Upgrade sudachipy>=0.6.8

* add a test case for sudachi_projection

* Compatible with older versions of SudachiPy

* make fixup

* make style

* error message for unidic download

* revert jumanpp test cases

* format options for sudachi_projection

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* format options for sudachi_split_mode and sudachi_dict_type

* comment

* add tests for full_tokenizer kwargs

* pass projection arg directly

* require_sudachi_projection

* make style

* revert upgrade sudachipy

* check is_sudachi_projection_available()

* revert dependency_version_table and bugfix

* style format

* simply raise ImportError

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* simply raise ImportError

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Authored by Hiroshi Matsuda on 2024-02-13 03:47:20 +00:00; committed by GitHub
parent b44567538b
commit da20209dbc
5 changed files with 109 additions and 15 deletions

File: src/transformers/models/bert_japanese/tokenization_bert_japanese.py

@@ -22,7 +22,7 @@ import unicodedata
 from typing import Any, Dict, List, Optional, Tuple
 from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...utils import is_sentencepiece_available, logging
+from ...utils import is_sentencepiece_available, is_sudachi_projection_available, logging
 if is_sentencepiece_available():
@@ -542,6 +542,7 @@ class SudachiTokenizer:
         sudachi_config_path=None,
         sudachi_resource_dir=None,
         sudachi_dict_type="core",
+        sudachi_projection=None,
     ):
         """
         Constructs a SudachiTokenizer.
@@ -557,11 +558,13 @@ class SudachiTokenizer:
             **trim_whitespace**: (*optional*) boolean (default False)
                 Whether to trim all whitespace, tab, newline from tokens.
             **sudachi_split_mode**: (*optional*) string
-                Split mode of sudachi, choose from "A", "B", "C".
+                Split mode of sudachi, choose from `["A", "B", "C"]`.
             **sudachi_config_path**: (*optional*) string
             **sudachi_resource_dir**: (*optional*) string
             **sudachi_dict_type**: (*optional*) string
-                dict type of sudachi, choose from "small", "core", "full".
+                dict type of sudachi, choose from `["small", "core", "full"]`.
+            **sudachi_projection**: (*optional*) string
+                Word projection mode of sudachi, choose from `["surface", "normalized", "reading", "dictionary", "dictionary_and_surface", "normalized_and_surface", "normalized_nouns"]`.
         """

         self.do_lower_case = do_lower_case
@@ -586,9 +589,17 @@ class SudachiTokenizer:
         else:
             raise ValueError("Invalid sudachi_split_mode is specified.")

-        self.sudachi = dictionary.Dictionary(
+        self.projection = sudachi_projection
+
+        sudachi_dictionary = dictionary.Dictionary(
             config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type
-        ).create(self.split_mode)
+        )
+        if is_sudachi_projection_available():
+            self.sudachi = sudachi_dictionary.create(self.split_mode, projection=self.projection)
+        elif self.projection is not None:
+            raise ImportError("You need to install sudachipy>=0.6.8 to specify `projection` field in sudachi_kwargs.")
+        else:
+            self.sudachi = sudachi_dictionary.create(self.split_mode)

     def tokenize(self, text, never_split=None, **kwargs):
         """Tokenizes a piece of text."""

File: src/transformers/testing_utils.py

@@ -95,6 +95,7 @@ from .utils import (
     is_soundfile_availble,
     is_spacy_available,
     is_sudachi_available,
+    is_sudachi_projection_available,
     is_tensorflow_probability_available,
     is_tensorflow_text_available,
     is_tf2onnx_available,
@@ -1043,6 +1044,15 @@ def require_sudachi(test_case):
     return unittest.skipUnless(is_sudachi_available(), "test requires sudachi")(test_case)


+def require_sudachi_projection(test_case):
+    """
+    Decorator marking a test that requires sudachi_projection
+    """
+    return unittest.skipUnless(is_sudachi_projection_available(), "test requires sudachi which supports projection")(
+        test_case
+    )
+
+
 def require_jumanpp(test_case):
     """
     Decorator marking a test that requires jumanpp
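For reference, a hypothetical sketch of how the new decorator gates a test case (the class and method names here are illustrative, not from this commit):

import unittest

from transformers.testing_utils import require_sudachi_projection


class ExampleProjectionTest(unittest.TestCase):
    # Skipped automatically unless sudachipy>=0.6.8 is installed.
    @require_sudachi_projection
    def test_projection_is_accepted(self):
        from transformers.models.bert_japanese.tokenization_bert_japanese import SudachiTokenizer

        tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_projection="surface")
        self.assertIsNotNone(tokenizer)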

File: src/transformers/utils/__init__.py

@@ -163,6 +163,7 @@ from .import_utils import (
     is_spacy_available,
     is_speech_available,
     is_sudachi_available,
+    is_sudachi_projection_available,
     is_tensorflow_probability_available,
     is_tensorflow_text_available,
     is_tf2onnx_available,

File: src/transformers/utils/import_utils.py

@@ -135,7 +135,7 @@ if _sklearn_available:
 _smdistributed_available = importlib.util.find_spec("smdistributed") is not None
 _soundfile_available = _is_package_available("soundfile")
 _spacy_available = _is_package_available("spacy")
-_sudachipy_available = _is_package_available("sudachipy")
+_sudachipy_available, _sudachipy_version = _is_package_available("sudachipy", return_version=True)
 _tensorflow_probability_available = _is_package_available("tensorflow_probability")
 _tensorflow_text_available = _is_package_available("tensorflow_text")
 _tf2onnx_available = _is_package_available("tf2onnx")
@@ -896,6 +896,19 @@ def is_sudachi_available():
     return _sudachipy_available


+def get_sudachi_version():
+    return _sudachipy_version
+
+
+def is_sudachi_projection_available():
+    if not is_sudachi_available():
+        return False
+
+    # NOTE: We require sudachipy>=0.6.8 to use projection option in sudachi_kwargs for the constructor of BertJapaneseTokenizer.
+    # - `projection` option is not supported in sudachipy<0.6.8, see https://github.com/WorksApplications/sudachi.rs/issues/230
+    return version.parse(_sudachipy_version) >= version.parse("0.6.8")
+
+
 def is_jumanpp_available():
     return (importlib.util.find_spec("rhoknp") is not None) and (shutil.which("jumanpp") is not None)
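A short sketch of how callers can branch on these helpers, mirroring the constructor logic in tokenization_bert_japanese.py above (variable names are illustrative):

from transformers.utils import is_sudachi_available, is_sudachi_projection_available

if is_sudachi_projection_available():
    projection = "normalized_nouns"  # sudachipy>=0.6.8: `projection` is supported
elif is_sudachi_available():
    projection = None  # older sudachipy: passing a projection would raise ImportError
else:
    raise ImportError("sudachipy is not installed")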

File: tests/models/bert_japanese/test_tokenization_bert_japanese.py

@@ -29,7 +29,7 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import (
     SudachiTokenizer,
     WordpieceTokenizer,
 )
-from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi
+from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi_projection
 from ...test_tokenization_common import TokenizerTesterMixin
@@ -60,6 +60,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "##、",
             "。",
             "##。",
+            "アップルストア",
+            "外国",
+            "##人",
+            "参政",
+            "##権",
+            "此れ",
+            "は",
+            "猫",
+            "です",
         ]

         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
@@ -113,6 +122,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, tokens_loaded)

+    def test_mecab_full_tokenizer_with_mecab_kwargs(self):
+        tokenizer = self.tokenizer_class(
+            self.vocab_file, word_tokenizer_type="mecab", mecab_kwargs={"mecab_dic": "ipadic"}
+        )
+
+        text = "アップルストア"
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, ["アップルストア"])
+
     def test_mecab_tokenizer_ipadic(self):
         tokenizer = MecabTokenizer(mecab_dic="ipadic")
@@ -134,6 +152,12 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_mecab_tokenizer_unidic(self):
         try:
             import unidic
+
+            self.assertTrue(
+                os.path.isdir(unidic.DICDIR),
+                "The content of unidic was not downloaded. Run `python -m unidic download` before running this test case. Note that this requires 2.1GB on disk.",
+            )
             tokenizer = MecabTokenizer(mecab_dic="unidic")
         except ModuleNotFoundError:
             return
@@ -173,7 +197,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             ["アップルストア", "で", "iPhone", "８", "が", "発売", "さ", "れ", "た", " ", "。"],
         )

-    @require_sudachi
+    @require_sudachi_projection
     def test_pickle_sudachi_tokenizer(self):
         tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi")
         self.assertIsNotNone(tokenizer)
@@ -194,7 +218,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, tokens_loaded)

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_core(self):
         tokenizer = SudachiTokenizer(sudachi_dict_type="core")
@@ -205,37 +229,61 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         )
         # fmt: on

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_split_mode_A(self):
         tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="A")
         self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "人", "参政", "権"])

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_split_mode_B(self):
         tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="B")
         self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人", "参政権"])

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_split_mode_C(self):
         tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="C")
         self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人参政権"])
+    @require_sudachi_projection
+    def test_sudachi_full_tokenizer_with_sudachi_kwargs_split_mode_B(self):
+        tokenizer = self.tokenizer_class(
+            self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_split_mode": "B"}
+        )
+        self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "##人", "参政", "##権"])
+
+    @require_sudachi_projection
+    def test_sudachi_tokenizer_projection(self):
+        tokenizer = SudachiTokenizer(
+            sudachi_dict_type="core", sudachi_split_mode="A", sudachi_projection="normalized_nouns"
+        )
+        self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "は", "猫", "です", "。"])
+
+    @require_sudachi_projection
+    def test_sudachi_full_tokenizer_with_sudachi_kwargs_sudachi_projection(self):
+        tokenizer = self.tokenizer_class(
+            self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_projection": "normalized_nouns"}
+        )
+        self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "は", "猫", "です", "。"])
-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_lower(self):
         tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core")
         self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone８ が  \n 発売された　。  "), [" ", "\t", "アップル", "ストア", "で", "iphone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "])  # fmt: skip

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_no_normalize(self):
         tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core")
         self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone８ が  \n 発売された　。  "), [" ", "\t", "アップル", "ストア", "で", "iPhone", "８", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", "\u3000", "。", " ", " "])  # fmt: skip

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_trim_whitespace(self):
         tokenizer = SudachiTokenizer(trim_whitespace=True, sudachi_dict_type="core")
@@ -293,6 +341,17 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "。"],
         )

+    @require_jumanpp
+    def test_jumanpp_full_tokenizer_with_jumanpp_kwargs_trim_whitespace(self):
+        tokenizer = self.tokenizer_class(
+            self.vocab_file, word_tokenizer_type="jumanpp", jumanpp_kwargs={"trim_whitespace": True}
+        )
+
+        text = "こんにちは、世界。\nこんばんは、世界。"
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
+
     @require_jumanpp
     def test_jumanpp_tokenizer_ext(self):
         tokenizer = JumanppTokenizer()
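End to end, the option also flows through BertJapaneseTokenizer via sudachi_kwargs, as the new full-tokenizer tests exercise. A hedged end-user sketch (the checkpoint name is illustrative; assumes sudachipy>=0.6.8 and a Sudachi dictionary are installed):

from transformers import BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese",  # illustrative Japanese BERT checkpoint
    word_tokenizer_type="sudachi",
    sudachi_kwargs={"sudachi_split_mode": "A", "sudachi_projection": "normalized_nouns"},
)
print(tokenizer.tokenize("これはねこです。"))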