diff --git a/.circleci/config.yml b/.circleci/config.yml
index ef84380ea51..56a2d91cce7 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -150,6 +150,7 @@ jobs:
             - v0.3-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
       - run: pip install .[ja,testing]
+      - run: python -m unidic download
       - save_cache:
          key: v0.3-custom_tokenizers-{{ checksum "setup.py" }}
          paths:
diff --git a/setup.py b/setup.py
index 206c3e35409..5f85db9c282 100644
--- a/setup.py
+++ b/setup.py
@@ -65,7 +65,7 @@ if stale_egg_info.exists():
 
 extras = {}
 
-extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0,<2.0"]
+extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0.0,<2.0", "unidic_lite>=1.0.7", "unidic>=1.0.2"]
 extras["sklearn"] = ["scikit-learn"]
 
 # keras2onnx and onnxconverter-common version is specific through a commit until 1.7.0 lands on pypi
diff --git a/src/transformers/tokenization_bert_japanese.py b/src/transformers/tokenization_bert_japanese.py
index c3ede2c47e5..e476f463d48 100644
--- a/src/transformers/tokenization_bert_japanese.py
+++ b/src/transformers/tokenization_bert_japanese.py
@@ -167,30 +167,89 @@ class BertJapaneseTokenizer(BertTokenizer):
 class MecabTokenizer:
     """Runs basic tokenization with MeCab morphological parser."""
 
-    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None):
+    def __init__(
+        self,
+        do_lower_case=False,
+        never_split=None,
+        normalize_text=True,
+        mecab_dic: Optional[str] = "ipadic",
+        mecab_option: Optional[str] = None,
+    ):
         """Constructs a MecabTokenizer.
 
         Args:
             **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input.
+                Whether to lowercase the input.
             **never_split**: (`optional`) list of str
                 Kept for backward compatibility purposes.
                 Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
-                List of token not to split.
+                List of tokens not to split.
             **normalize_text**: (`optional`) boolean (default True)
                 Whether to apply unicode normalization to text before tokenization.
-            **mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "")
+            **mecab_dic**: (`optional`) string (default "ipadic")
+                Name of dictionary to be used for MeCab initialization.
+                If you are using a system-installed dictionary, set this option to `None` and modify `mecab_option`.
+            **mecab_option**: (`optional`) string
+                String passed to MeCab constructor.
         """
         self.do_lower_case = do_lower_case
         self.never_split = never_split if never_split is not None else []
         self.normalize_text = normalize_text
 
-        import fugashi
-        import ipadic
+        try:
+            import fugashi
+        except ModuleNotFoundError as error:
+            raise error.__class__(
+                "You need to install fugashi to use MecabTokenizer. "
+                "See https://pypi.org/project/fugashi/ for installation."
+            )
 
-        # Use ipadic by default (later options can override it)
         mecab_option = mecab_option or ""
-        mecab_option = ipadic.MECAB_ARGS + " " + mecab_option
+
+        if mecab_dic is not None:
+            if mecab_dic == "ipadic":
+                try:
+                    import ipadic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The ipadic dictionary is not installed. "
+                        "See https://github.com/polm/ipadic-py for installation."
+                    )
+
+                dic_dir = ipadic.DICDIR
+
+            elif mecab_dic == "unidic_lite":
+                try:
+                    import unidic_lite
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic_lite dictionary is not installed. "
+                        "See https://github.com/polm/unidic-lite for installation."
+                    )
+
+                dic_dir = unidic_lite.DICDIR
+
+            elif mecab_dic == "unidic":
+                try:
+                    import unidic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic dictionary is not installed. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+
+                dic_dir = unidic.DICDIR
+                if not os.path.isdir(dic_dir):
+                    raise RuntimeError(
+                        "The unidic dictionary itself is not found. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+
+            else:
+                raise ValueError("Invalid mecab_dic is specified.")
+
+            mecabrc = os.path.join(dic_dir, "mecabrc")
+            mecab_option = "-d {} -r {} ".format(dic_dir, mecabrc) + mecab_option
 
         self.mecab = fugashi.GenericTagger(mecab_option)
 
@@ -213,7 +272,7 @@ class MecabTokenizer:
 
         return tokens
 
 
-class CharacterTokenizer(object):
+class CharacterTokenizer:
     """Runs Character tokenziation."""
 
     def __init__(self, vocab, unk_token, normalize_text=True):
@@ -247,7 +306,7 @@ class CharacterTokenizer(object):
             text = unicodedata.normalize("NFKC", text)
 
         output_tokens = []
-        for i, char in enumerate(text):
+        for char in text:
             if char not in self.vocab:
                 output_tokens.append(self.unk_token)
                 continue
diff --git a/tests/test_tokenization_bert_japanese.py b/tests/test_tokenization_bert_japanese.py
index 03ff26e3a05..b14f19f9ada 100644
--- a/tests/test_tokenization_bert_japanese.py
+++ b/tests/test_tokenization_bert_japanese.py
@@ -87,16 +87,38 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
 
-    def test_mecab_tokenizer(self):
-        tokenizer = MecabTokenizer()
+    def test_mecab_tokenizer_ipadic(self):
+        tokenizer = MecabTokenizer(mecab_dic="ipadic")
 
         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
             ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
         )
 
+    def test_mecab_tokenizer_unidic_lite(self):
+        try:
+            tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
+        except ModuleNotFoundError:
+            return
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
+    def test_mecab_tokenizer_unidic(self):
+        try:
+            tokenizer = MecabTokenizer(mecab_dic="unidic")
+        except ModuleNotFoundError:
+            return
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
     def test_mecab_tokenizer_lower(self):
-        tokenizer = MecabTokenizer(do_lower_case=True)
+        tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic")
 
         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
@@ -118,7 +140,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         )
 
     def test_mecab_tokenizer_no_normalize(self):
-        tokenizer = MecabTokenizer(normalize_text=False)
+        tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic")
 
         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
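For reference, a minimal usage sketch of the new `mecab_dic` argument (not part of the patch itself; the import path and expected tokens follow the diff and its tests, and the unidic options require the corresponding package to be installed, with `python -m unidic download` for full unidic):

```python
# Usage sketch for the mecab_dic argument added by this patch (hypothetical example).
from transformers.tokenization_bert_japanese import MecabTokenizer

# Default behaviour is unchanged: the bundled ipadic dictionary is used.
ipadic_tokenizer = MecabTokenizer(mecab_dic="ipadic")
print(ipadic_tokenizer.tokenize("アップルストアでiPhone8が発売された。"))
# -> ['アップルストア', 'で', 'iPhone', '8', 'が', '発売', 'さ', 'れ', 'た', '。']

# Switch to UniDic Lite; pass mecab_dic=None plus mecab_option to use a system dictionary instead.
unidic_tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
print(unidic_tokenizer.tokenize("アップルストアでiPhone8が発売された。"))
# -> ['アップル', 'ストア', 'で', 'iPhone', '8', 'が', '発売', 'さ', 'れ', 'た', '。']
```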