diff --git a/.circleci/config.yml b/.circleci/config.yml
index 06c621621f6..aef10586cdc 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -409,6 +409,16 @@ jobs:
           keys:
             - v0.5-custom_tokenizers-{{ checksum "setup.py" }}
             - v0.5-custom_tokenizers-
+      - run: sudo apt-get -y update && sudo apt-get install -y cmake
+      - run:
+          name: install jumanpp
+          command: |
+            wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
+            tar xvf jumanpp-2.0.0-rc3.tar.xz
+            mkdir jumanpp-2.0.0-rc3/bld
+            cd jumanpp-2.0.0-rc3/bld
+            sudo cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local
+            sudo make install
       - run: pip install --upgrade pip
       - run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]
       - run: python -m unidic download
diff --git a/setup.py b/setup.py
index b82de85b9b2..84c5df4793e 100644
--- a/setup.py
+++ b/setup.py
@@ -170,6 +170,9 @@ _deps = [
     "unidic_lite>=1.0.7",
     "uvicorn",
     "beautifulsoup4",
+    "sudachipy>=0.6.6",
+    "sudachidict_core>=20220729",
+    "pyknp>=0.6.1",
 ]
 
 
@@ -239,7 +242,7 @@ class DepsTableUpdateCommand(Command):
 
 extras = {}
 
-extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic")
+extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "pyknp")
 extras["sklearn"] = deps_list("scikit-learn")
 
 extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text")
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index d8d2fce767a..48a803fcdfb 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -76,4 +76,7 @@ deps = {
     "unidic_lite": "unidic_lite>=1.0.7",
     "uvicorn": "uvicorn",
     "beautifulsoup4": "beautifulsoup4",
+    "sudachipy": "sudachipy>=0.6.6",
+    "sudachidict_core": "sudachidict_core>=20220729",
+    "pyknp": "pyknp>=0.6.1",
 }
diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
index 0b33e858a10..0b6ccab3c49 100644
--- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
+++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
@@ -77,7 +77,7 @@ PRETRAINED_INIT_CONFIGURATION = {
 class BertJapaneseTokenizer(BertTokenizer):
     r"""
-    Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer.
+    Construct a BERT tokenizer for Japanese text.
 
     Args:
         vocab_file (`str`):
@@ -89,11 +89,15 @@ class BertJapaneseTokenizer(BertTokenizer):
         do_subword_tokenize (`bool`, *optional*, defaults to `True`):
             Whether to do subword tokenization.
         word_tokenizer_type (`str`, *optional*, defaults to `"basic"`):
-            Type of word tokenizer.
+            Type of word tokenizer. Choose from ["basic", "mecab", "sudachi", "jumanpp"].
         subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`):
-            Type of subword tokenizer.
-        mecab_kwargs (`str`, *optional*):
+            Type of subword tokenizer. Choose from ["wordpiece", "character"].
+        mecab_kwargs (`dict`, *optional*):
             Dictionary passed to the `MecabTokenizer` constructor.
+        sudachi_kwargs (`dict`, *optional*):
+            Dictionary passed to the `SudachiTokenizer` constructor.
+        jumanpp_kwargs (`dict`, *optional*):
+            Dictionary passed to the `JumanppTokenizer` constructor.
""" vocab_files_names = VOCAB_FILES_NAMES @@ -116,6 +120,8 @@ class BertJapaneseTokenizer(BertTokenizer): cls_token="[CLS]", mask_token="[MASK]", mecab_kwargs=None, + sudachi_kwargs=None, + jumanpp_kwargs=None, **kwargs ): super(BertTokenizer, self).__init__( @@ -131,6 +137,8 @@ class BertJapaneseTokenizer(BertTokenizer): subword_tokenizer_type=subword_tokenizer_type, never_split=never_split, mecab_kwargs=mecab_kwargs, + sudachi_kwargs=sudachi_kwargs, + jumanpp_kwargs=jumanpp_kwargs, **kwargs, ) # ^^ We call the grandparent's init, not the parent's. @@ -148,6 +156,8 @@ class BertJapaneseTokenizer(BertTokenizer): self.lower_case = do_lower_case self.never_split = never_split self.mecab_kwargs = copy.deepcopy(mecab_kwargs) + self.sudachi_kwargs = copy.deepcopy(sudachi_kwargs) + self.jumanpp_kwargs = copy.deepcopy(jumanpp_kwargs) if do_word_tokenize: if word_tokenizer_type == "basic": self.word_tokenizer = BasicTokenizer( @@ -157,6 +167,14 @@ class BertJapaneseTokenizer(BertTokenizer): self.word_tokenizer = MecabTokenizer( do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {}) ) + elif word_tokenizer_type == "sudachi": + self.word_tokenizer = SudachiTokenizer( + do_lower_case=do_lower_case, never_split=never_split, **(sudachi_kwargs or {}) + ) + elif word_tokenizer_type == "jumanpp": + self.word_tokenizer = JumanppTokenizer( + do_lower_case=do_lower_case, never_split=never_split, **(jumanpp_kwargs or {}) + ) else: raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.") @@ -176,7 +194,7 @@ class BertJapaneseTokenizer(BertTokenizer): def __getstate__(self): state = dict(self.__dict__) - if self.word_tokenizer_type == "mecab": + if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]: del state["word_tokenizer"] return state @@ -186,6 +204,14 @@ class BertJapaneseTokenizer(BertTokenizer): self.word_tokenizer = MecabTokenizer( do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {}) ) + elif self.word_tokenizer_type == "sudachi": + self.word_tokenizer = SudachiTokenizer( + do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {}) + ) + elif self.word_tokenizer_type == "jumanpp": + self.word_tokenizer = JumanppTokenizer( + do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {}) + ) def _tokenize(self, text): if self.do_word_tokenize: @@ -309,6 +335,157 @@ class MecabTokenizer: return tokens +class SudachiTokenizer: + """Runs basic tokenization with Sudachi morphological parser.""" + + def __init__( + self, + do_lower_case=False, + never_split=None, + normalize_text=True, + trim_whitespace=False, + sudachi_split_mode="A", + sudachi_config_path=None, + sudachi_resource_dir=None, + sudachi_dict_type="core", + ): + """ + Constructs a SudachiTokenizer. + + Args: + **do_lower_case**: (*optional*) boolean (default True) + Whether to lowercase the input. + **never_split**: (*optional*) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of tokens not to split. + **normalize_text**: (*optional*) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + **trim_whitespace**: (*optional*) boolean (default False) + Whether to trim all whitespace, tab, newline from tokens. + **sudachi_split_mode**: (*optional*) string + Split mode of sudachi, choose from "A", "B", "C". 
+            **sudachi_config_path**: (*optional*) string
+                Path to a config file of SudachiPy, passed to the Sudachi dictionary constructor.
+            **sudachi_resource_dir**: (*optional*) string
+                Path to a resource dir of SudachiPy, passed to the Sudachi dictionary constructor.
+            **sudachi_dict_type**: (*optional*) string
+                Dict type of sudachi, choose from "small", "core", "full".
+        """
+
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split if never_split is not None else []
+        self.normalize_text = normalize_text
+        self.trim_whitespace = trim_whitespace
+
+        try:
+            from sudachipy import dictionary, tokenizer
+        except ImportError:
+            raise ImportError(
+                "You need to install sudachipy to use SudachiTokenizer. "
+                "See https://github.com/WorksApplications/SudachiPy for installation."
+            )
+
+        if sudachi_split_mode == "A":
+            self.split_mode = tokenizer.Tokenizer.SplitMode.A
+        elif sudachi_split_mode == "B":
+            self.split_mode = tokenizer.Tokenizer.SplitMode.B
+        elif sudachi_split_mode == "C":
+            self.split_mode = tokenizer.Tokenizer.SplitMode.C
+        else:
+            raise ValueError("Invalid sudachi_split_mode is specified.")
+
+        self.sudachi = dictionary.Dictionary(
+            config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict_type=sudachi_dict_type
+        ).create(self.split_mode)
+
+    def tokenize(self, text, never_split=None, **kwargs):
+        """Tokenizes a piece of text."""
+        if self.normalize_text:
+            text = unicodedata.normalize("NFKC", text)
+
+        never_split = self.never_split + (never_split if never_split is not None else [])
+        tokens = []
+
+        for word in self.sudachi.tokenize(text):
+            token = word.surface()
+
+            if self.do_lower_case and token not in never_split:
+                token = token.lower()
+
+            if self.trim_whitespace:
+                if token.strip() == "":
+                    continue
+                else:
+                    token = token.strip()
+
+            tokens.append(token)
+
+        return tokens
+
+
+class JumanppTokenizer:
+    """Runs basic tokenization with jumanpp morphological parser."""
+
+    def __init__(
+        self,
+        do_lower_case=False,
+        never_split=None,
+        normalize_text=True,
+        trim_whitespace=False,
+    ):
+        """
+        Constructs a JumanppTokenizer.
+
+        Args:
+            **do_lower_case**: (*optional*) boolean (default False)
+                Whether to lowercase the input.
+            **never_split**: (*optional*) list of str
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
+            **normalize_text**: (*optional*) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+            **trim_whitespace**: (*optional*) boolean (default False)
+                Whether to trim whitespace, tabs, and newlines from tokens.
+        """
+
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split if never_split is not None else []
+        self.normalize_text = normalize_text
+        self.trim_whitespace = trim_whitespace
+
+        try:
+            import pyknp
+        except ImportError:
+            raise ImportError(
+                "You need to install pyknp to use JumanppTokenizer. "
+                "See https://github.com/ku-nlp/pyknp for installation."
+            )
+
+        self.juman = pyknp.Juman(jumanpp=True)
+
+    def tokenize(self, text, never_split=None, **kwargs):
+        """Tokenizes a piece of text."""
+        if self.normalize_text:
+            text = unicodedata.normalize("NFKC", text)
+
+        never_split = self.never_split + (never_split if never_split is not None else [])
+        tokens = []
+
+        for mrph in self.juman.analysis(text).mrph_list():
+            token = mrph.midasi
+
+            if self.do_lower_case and token not in never_split:
+                token = token.lower()
+
+            if self.trim_whitespace:
+                if token.strip() == "":
+                    continue
+                else:
+                    token = token.strip()
+
+            tokens.append(token)
+
+        return tokens
+
+
 class CharacterTokenizer:
     """Runs Character tokenization."""
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 7e3242e94c9..08409b6e092 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -52,6 +52,7 @@ from .utils import (
     is_flax_available,
     is_ftfy_available,
     is_ipex_available,
+    is_jumanpp_available,
     is_librosa_available,
     is_onnx_available,
     is_pandas_available,
@@ -66,6 +67,7 @@ from .utils import (
     is_sentencepiece_available,
     is_soundfile_availble,
     is_spacy_available,
+    is_sudachi_available,
     is_tensorflow_probability_available,
     is_tensorflow_text_available,
     is_tf2onnx_available,
@@ -671,6 +673,20 @@ def require_usr_bin_time(test_case):
     return unittest.skipUnless(cmd_exists("/usr/bin/time"), "test requires /usr/bin/time")(test_case)
 
 
+def require_sudachi(test_case):
+    """
+    Decorator marking a test that requires sudachi.
+    """
+    return unittest.skipUnless(is_sudachi_available(), "test requires sudachi")(test_case)
+
+
+def require_jumanpp(test_case):
+    """
+    Decorator marking a test that requires jumanpp.
+    """
+    return unittest.skipUnless(is_jumanpp_available(), "test requires jumanpp")(test_case)
+
+
 def get_gpu_count():
     """
     Return the number of available gpus (regardless of whether torch, tf or jax is used)
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index fdd1c376dab..24d32d0a012 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -98,6 +98,7 @@ from .import_utils import (
     is_ftfy_available,
     is_in_notebook,
     is_ipex_available,
+    is_jumanpp_available,
     is_librosa_available,
     is_ninja_available,
     is_onnx_available,
@@ -121,6 +122,7 @@ from .import_utils import (
     is_soundfile_availble,
     is_spacy_available,
     is_speech_available,
+    is_sudachi_available,
     is_tensorflow_probability_available,
     is_tensorflow_text_available,
     is_tf2onnx_available,
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 81b7c478c1b..2fc52b52a20 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -18,6 +18,7 @@ Import utilities: Utilities related to imports and our lazy inits.
 import importlib.util
 import json
 import os
+import shutil
 import sys
 import warnings
 from collections import OrderedDict
@@ -671,6 +672,14 @@ def is_ccl_available():
     return _is_ccl_available
 
 
+def is_sudachi_available():
+    return importlib.util.find_spec("sudachipy") is not None
+
+
+def is_jumanpp_available():
+    return (importlib.util.find_spec("pyknp") is not None) and (shutil.which("jumanpp") is not None)
+
+
 # docstyle-ignore
 DATASETS_IMPORT_ERROR = """
 {0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
index 86b3f16f101..9aba5c3705a 100644
--- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py
+++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
@@ -24,10 +24,12 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import (
     BertJapaneseTokenizer,
     BertTokenizer,
     CharacterTokenizer,
+    JumanppTokenizer,
     MecabTokenizer,
+    SudachiTokenizer,
     WordpieceTokenizer,
 )
-from transformers.testing_utils import custom_tokenizers
+from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
@@ -172,6 +174,150 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"],
         )
 
+    @require_sudachi
+    def test_pickle_sudachi_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi")
+        self.assertIsNotNone(tokenizer)
+
+        text = "こんにちは、世界。\nこんばんは、世界。"
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
+
+        filename = os.path.join(self.tmpdirname, "tokenizer.bin")
+        with open(filename, "wb") as handle:
+            pickle.dump(tokenizer, handle)
+
+        with open(filename, "rb") as handle:
+            tokenizer_new = pickle.load(handle)
+
+        tokens_loaded = tokenizer_new.tokenize(text)
+
+        self.assertListEqual(tokens, tokens_loaded)
+
+    @require_sudachi
+    def test_sudachi_tokenizer_core(self):
+        tokenizer = SudachiTokenizer(sudachi_dict_type="core")
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            # fmt: off
+            [" ", "\t", "アップル", "ストア", "で", "iPhone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "],
+            # fmt: on
+        )
+
+    @require_sudachi
+    def test_sudachi_tokenizer_split_mode_A(self):
+        tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="A")
+
+        self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "人", "参政", "権"])
+
+    @require_sudachi
+    def test_sudachi_tokenizer_split_mode_B(self):
+        tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="B")
+
+        self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人", "参政権"])
+
+    @require_sudachi
+    def test_sudachi_tokenizer_split_mode_C(self):
+        tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="C")
+
+        self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人参政権"])
+
+    @require_sudachi
+    def test_sudachi_tokenizer_lower(self):
+        tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core")
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            # fmt: off
+            [" ", "\t", "アップル", "ストア", "で", "iphone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "],
+            # fmt: on
+        )
+
+    @require_sudachi
+    def test_sudachi_tokenizer_no_normalize(self):
+        tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core")
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            # fmt: off
+            [" ", "\t", "アップル", "ストア", "で", "iPhone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", "\u3000", "。", " ", " "],
+            # fmt: on
+        )
+
+    @require_sudachi
+    def test_sudachi_tokenizer_trim_whitespace(self):
+        tokenizer = SudachiTokenizer(trim_whitespace=True, sudachi_dict_type="core")
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
+    @require_jumanpp
+    def test_pickle_jumanpp_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="jumanpp")
+        self.assertIsNotNone(tokenizer)
+
+        text = "こんにちは、世界。\nこんばんは、世界。"
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
+
+        filename = os.path.join(self.tmpdirname, "tokenizer.bin")
+        with open(filename, "wb") as handle:
+            pickle.dump(tokenizer, handle)
+
+        with open(filename, "rb") as handle:
+            tokenizer_new = pickle.load(handle)
+
+        tokens_loaded = tokenizer_new.tokenize(text)
+
+        self.assertListEqual(tokens, tokens_loaded)
+
+    @require_jumanpp
+    def test_jumanpp_tokenizer(self):
+        tokenizer = JumanppTokenizer()
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            # fmt: off
+            ["アップル", "ストア", "で", "iPhone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"],
+            # fmt: on
+        )
+
+    @require_jumanpp
+    def test_jumanpp_tokenizer_lower(self):
+        tokenizer = JumanppTokenizer(do_lower_case=True)
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            # fmt: off
+            ["アップル", "ストア", "で", "iphone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"],
+            # fmt: on
+        )
+
+    @require_jumanpp
+    def test_jumanpp_tokenizer_no_normalize(self):
+        tokenizer = JumanppTokenizer(normalize_text=False)
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            # fmt: off
+            ["ア", "ッ", "フ", "゚", "ル", "ストア", "で", "iPhone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"],
+            # fmt: on
+        )
+
+    @require_jumanpp
+    def test_jumanpp_tokenizer_trim_whitespace(self):
+        tokenizer = JumanppTokenizer(trim_whitespace=True)
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "。"],
+        )
+
     def test_wordpiece_tokenizer(self):
         vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"]
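
Usage sketch (reviewer note, not part of the diff). The Sudachi split-mode outputs below are copied from the `test_sudachi_tokenizer_split_mode_*` cases above; running the snippet assumes `sudachipy` and `sudachidict_core` are installed and, for Juman++, that `pyknp` is installed and the `jumanpp` binary is on PATH (which is exactly what the new `is_jumanpp_available()` checks).

```python
from transformers.models.bert_japanese.tokenization_bert_japanese import (
    JumanppTokenizer,
    SudachiTokenizer,
)

# Sudachi's split mode controls segmentation granularity, from short units (A)
# to the longest match (C); the expected lists mirror the tests in this diff.
for mode, expected in [
    ("A", ["外国", "人", "参政", "権"]),
    ("B", ["外国人", "参政権"]),
    ("C", ["外国人参政権"]),
]:
    tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode=mode)
    assert tokenizer.tokenize("外国人参政権") == expected

# Juman++ is driven through pyknp; the exact output depends on the installed
# jumanpp version, so it is printed rather than asserted here.
tokenizer = JumanppTokenizer(trim_whitespace=True)
print(tokenizer.tokenize("アップルストアでiPhone8が発売された。"))
```

The same tokenizers are reachable through `BertJapaneseTokenizer(vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={...})` (or `word_tokenizer_type="jumanpp"` with `jumanpp_kwargs`), which is the path the pickle round-trip tests exercise.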