Support additional dictionaries for BERT Japanese tokenizers (#6515)
* Update BERT Japanese tokenizers
* Update CircleCI config to download unidic
* Specify to use the latest dictionary packages
parent 423eb5b1d7
commit 48c6c6139f

@@ -150,6 +150,7 @@ jobs:
             - v0.3-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
       - run: pip install .[ja,testing]
+      - run: python -m unidic download
       - save_cache:
           key: v0.3-custom_tokenizers-{{ checksum "setup.py" }}
           paths:
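
The added `python -m unidic download` step is needed because the unidic package installs only a small stub; the dictionary data itself is fetched separately. A minimal sketch of the check this enables (the assertion and its message are illustrative, not part of the change):

    # Verify the unidic data is present after running `python -m unidic download`.
    # unidic.DICDIR points at the downloaded dictionary directory; MecabTokenizer
    # performs a similar check before handing the directory to MeCab.
    import os

    import unidic

    assert os.path.isdir(unidic.DICDIR), "unidic data missing; run `python -m unidic download`"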

setup.py (2 lines changed)

@@ -65,7 +65,7 @@ if stale_egg_info.exists():

 extras = {}

-extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0,<2.0"]
+extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0.0,<2.0", "unidic_lite>=1.0.7", "unidic>=1.0.2"]
 extras["sklearn"] = ["scikit-learn"]

 # keras2onnx and onnxconverter-common version is specific through a commit until 1.7.0 lands on pypi
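
For orientation, the `ja` extra now installs the MeCab bindings plus three dictionary packages. A rough sketch of what each provides (attribute names are as published by the respective packages; the snippet is illustrative and assumes they are all installed):

    import fugashi      # MeCab bindings; provides fugashi.GenericTagger used by MecabTokenizer
    import ipadic       # default dictionary; ipadic.DICDIR points at its bundled data
    import unidic_lite  # self-contained small UniDic; unidic_lite.DICDIR works out of the box
    import unidic       # full UniDic; its DICDIR is only usable after `python -m unidic download`

    print(ipadic.DICDIR, unidic_lite.DICDIR, unidic.DICDIR)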

@@ -167,30 +167,89 @@ class BertJapaneseTokenizer(BertTokenizer):
 class MecabTokenizer:
     """Runs basic tokenization with MeCab morphological parser."""

-    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None):
+    def __init__(
+        self,
+        do_lower_case=False,
+        never_split=None,
+        normalize_text=True,
+        mecab_dic: Optional[str] = "ipadic",
+        mecab_option: Optional[str] = None,
+    ):
         """Constructs a MecabTokenizer.

         Args:
             **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input.
+                Whether to lowercase the input.
             **never_split**: (`optional`) list of str
                 Kept for backward compatibility purposes.
                 Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
-                List of token not to split.
+                List of tokens not to split.
             **normalize_text**: (`optional`) boolean (default True)
                 Whether to apply unicode normalization to text before tokenization.
-            **mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "")
+            **mecab_dic**: (`optional`) string (default "ipadic")
+                Name of dictionary to be used for MeCab initialization.
+                If you are using a system-installed dictionary, set this option to `None` and modify `mecab_option`.
+            **mecab_option**: (`optional`) string
+                String passed to MeCab constructor.
         """
         self.do_lower_case = do_lower_case
         self.never_split = never_split if never_split is not None else []
         self.normalize_text = normalize_text

-        import fugashi
-        import ipadic
+        try:
+            import fugashi
+        except ModuleNotFoundError as error:
+            raise error.__class__(
+                "You need to install fugashi to use MecabTokenizer. "
+                "See https://pypi.org/project/fugashi/ for installation."
+            )

-        # Use ipadic by default (later options can override it)
         mecab_option = mecab_option or ""
-        mecab_option = ipadic.MECAB_ARGS + " " + mecab_option
+
+        if mecab_dic is not None:
+            if mecab_dic == "ipadic":
+                try:
+                    import ipadic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The ipadic dictionary is not installed. "
+                        "See https://github.com/polm/ipadic-py for installation."
+                    )
+
+                dic_dir = ipadic.DICDIR
+
+            elif mecab_dic == "unidic_lite":
+                try:
+                    import unidic_lite
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic_lite dictionary is not installed. "
+                        "See https://github.com/polm/unidic-lite for installation."
+                    )
+
+                dic_dir = unidic_lite.DICDIR
+
+            elif mecab_dic == "unidic":
+                try:
+                    import unidic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic dictionary is not installed. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+
+                dic_dir = unidic.DICDIR
+                if not os.path.isdir(dic_dir):
+                    raise RuntimeError(
+                        "The unidic dictionary itself is not found. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+
+            else:
+                raise ValueError("Invalid mecab_dic is specified.")
+
+            mecabrc = os.path.join(dic_dir, "mecabrc")
+            mecab_option = "-d {} -r {} ".format(dic_dir, mecabrc) + mecab_option

         self.mecab = fugashi.GenericTagger(mecab_option)

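
To make the new argument concrete, a hedged usage sketch follows (the module path matches the file being changed; the system dictionary path in the last line is a placeholder, and printed tokens depend on the installed dictionaries):

    # Illustrative use of the new mecab_dic argument; not part of the diff.
    from transformers.tokenization_bert_japanese import MecabTokenizer

    # Bundled dictionaries: "ipadic" (default), "unidic_lite", "unidic".
    ipadic_tokenizer = MecabTokenizer(mecab_dic="ipadic")
    print(ipadic_tokenizer.tokenize("アップルストアでiPhone8が発売された。"))

    unidic_lite_tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
    print(unidic_lite_tokenizer.tokenize("アップルストアでiPhone8が発売された。"))

    # For a system-installed dictionary, bypass the bundled ones and pass MeCab options directly.
    custom_tokenizer = MecabTokenizer(mecab_dic=None, mecab_option="-d /path/to/your/mecab/dic")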

@@ -213,7 +272,7 @@ class MecabTokenizer:
         return tokens


-class CharacterTokenizer(object):
+class CharacterTokenizer:
     """Runs Character tokenization."""

     def __init__(self, vocab, unk_token, normalize_text=True):

@@ -247,7 +306,7 @@ class CharacterTokenizer(object):
         text = unicodedata.normalize("NFKC", text)

         output_tokens = []
-        for i, char in enumerate(text):
+        for char in text:
             if char not in self.vocab:
                 output_tokens.append(self.unk_token)
                 continue
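
Since the loop above defines how unknown characters are handled, here is a small hedged sketch of CharacterTokenizer in isolation (the vocabulary and the expected output are illustrative):

    # Characters missing from the vocab map to unk_token, per the loop above.
    from transformers.tokenization_bert_japanese import CharacterTokenizer

    vocab = {"[UNK]": 0, "こ": 1, "ん": 2, "に": 3, "ち": 4, "は": 5}
    tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]")
    print(tokenizer.tokenize("こんにちほ"))  # ["こ", "ん", "に", "ち", "[UNK]"]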

@@ -87,16 +87,38 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

-    def test_mecab_tokenizer(self):
-        tokenizer = MecabTokenizer()
+    def test_mecab_tokenizer_ipadic(self):
+        tokenizer = MecabTokenizer(mecab_dic="ipadic")

         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
             ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
         )

+    def test_mecab_tokenizer_unidic_lite(self):
+        try:
+            tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
+        except ModuleNotFoundError:
+            return
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
+    def test_mecab_tokenizer_unidic(self):
+        try:
+            tokenizer = MecabTokenizer(mecab_dic="unidic")
+        except ModuleNotFoundError:
+            return
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
     def test_mecab_tokenizer_lower(self):
-        tokenizer = MecabTokenizer(do_lower_case=True)
+        tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic")

         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),

@@ -118,7 +140,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         )

     def test_mecab_tokenizer_no_normalize(self):
-        tokenizer = MecabTokenizer(normalize_text=False)
+        tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic")

         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
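
The unidic_lite and unidic tests return early when their dictionary package is missing, so the suite stays runnable without every dictionary installed. A hedged sketch of invoking just these tests locally (the RUN_CUSTOM_TOKENIZERS flag and the test file path are assumptions about how the repository gates its custom-tokenizer tests, not something shown in this diff):

    # Illustrative local test run; env var name and path are assumptions.
    import os

    import pytest

    os.environ["RUN_CUSTOM_TOKENIZERS"] = "yes"
    pytest.main(["-sv", "tests/test_tokenization_bert_japanese.py", "-k", "mecab"])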