Add sudachi and jumanpp tokenizers for bert_japanese (#19043)

* add sudachipy and jumanpp tokenizers for bert_japanese

* use ImportError instead of ModuleNotFoundError in SudachiTokenizer and JumanppTokenizer

* put test cases of test_tokenization_bert_japanese in one line

* add require_sudachi and require_jumanpp decorator for testing

* add sudachi and pyknp(jumanpp) to dependencies

* remove sudachi_dict_small and sudachi_dict_full from dependencies

* empty commit for ci
r-terada 2022-10-06 00:41:37 +09:00 committed by GitHub
parent 60db81ff60
commit 2f53ab5745
8 changed files with 373 additions and 7 deletions
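
A minimal usage sketch of what the new options enable (assuming sudachipy, sudachidict_core and pyknp are installed and a jumanpp binary is on PATH; the checkpoint name below is illustrative):

from transformers import BertJapaneseTokenizer

# Illustrative checkpoint; any vocab compatible with BertJapaneseTokenizer works here.
tokenizer = BertJapaneseTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese",
    word_tokenizer_type="sudachi",  # or "jumanpp"
    sudachi_kwargs={"sudachi_split_mode": "C", "sudachi_dict_type": "core"},
)
print(tokenizer.tokenize("外国人参政権"))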

.circleci/config.yml

@@ -409,6 +409,16 @@ jobs:
keys:
- v0.5-custom_tokenizers-{{ checksum "setup.py" }}
- v0.5-custom_tokenizers-
- run: sudo apt-get -y update && sudo apt-get install -y cmake
- run:
name: install jumanpp
command: |
wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
tar xvf jumanpp-2.0.0-rc3.tar.xz
mkdir jumanpp-2.0.0-rc3/bld
cd jumanpp-2.0.0-rc3/bld
sudo cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local
sudo make install
- run: pip install --upgrade pip
- run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]
- run: python -m unidic download
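
A quick sanity check of this CI environment, mirroring the availability checks added in import_utils.py further down (sketch):

import importlib.util
import shutil

# Sudachi only needs the Python packages; Juman++ needs pyknp plus the compiled `jumanpp` binary on PATH.
print("sudachipy importable:", importlib.util.find_spec("sudachipy") is not None)
print("pyknp importable:", importlib.util.find_spec("pyknp") is not None)
print("jumanpp binary on PATH:", shutil.which("jumanpp") is not None)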

setup.py

@@ -170,6 +170,9 @@ _deps = [
"unidic_lite>=1.0.7",
"uvicorn",
"beautifulsoup4",
"sudachipy>=0.6.6",
"sudachidict_core>=20220729",
"pyknp>=0.6.1",
]
@@ -239,7 +242,7 @@ class DepsTableUpdateCommand(Command):
extras = {}
- extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic")
+ extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "pyknp")
extras["sklearn"] = deps_list("scikit-learn")
extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text")

src/transformers/dependency_versions_table.py

@@ -76,4 +76,7 @@ deps = {
"unidic_lite": "unidic_lite>=1.0.7",
"uvicorn": "uvicorn",
"beautifulsoup4": "beautifulsoup4",
"sudachipy": "sudachipy>=0.6.6",
"sudachidict_core": "sudachidict_core>=20220729",
"pyknp": "pyknp>=0.6.1",
}

src/transformers/models/bert_japanese/tokenization_bert_japanese.py

@@ -77,7 +77,7 @@ PRETRAINED_INIT_CONFIGURATION = {
class BertJapaneseTokenizer(BertTokenizer):
r"""
- Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer.
+ Construct a BERT tokenizer for Japanese text.
Args:
vocab_file (`str`):
@@ -89,11 +89,15 @@ class BertJapaneseTokenizer(BertTokenizer):
do_subword_tokenize (`bool`, *optional*, defaults to `True`):
Whether to do subword tokenization.
word_tokenizer_type (`str`, *optional*, defaults to `"basic"`):
- Type of word tokenizer.
+ Type of word tokenizer. Choose from ["basic", "mecab", "sudachi", "jumanpp"].
subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`):
- Type of subword tokenizer.
+ Type of subword tokenizer. Choose from ["wordpiece", "character"].
- mecab_kwargs (`str`, *optional*):
+ mecab_kwargs (`dict`, *optional*):
Dictionary passed to the `MecabTokenizer` constructor.
sudachi_kwargs (`dict`, *optional*):
Dictionary passed to the `SudachiTokenizer` constructor.
jumanpp_kwargs (`dict`, *optional*):
Dictionary passed to the `JumanppTokenizer` constructor.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@ -116,6 +120,8 @@ class BertJapaneseTokenizer(BertTokenizer):
cls_token="[CLS]", cls_token="[CLS]",
mask_token="[MASK]", mask_token="[MASK]",
mecab_kwargs=None, mecab_kwargs=None,
sudachi_kwargs=None,
jumanpp_kwargs=None,
**kwargs
):
super(BertTokenizer, self).__init__(
@@ -131,6 +137,8 @@ class BertJapaneseTokenizer(BertTokenizer):
subword_tokenizer_type=subword_tokenizer_type,
never_split=never_split,
mecab_kwargs=mecab_kwargs,
sudachi_kwargs=sudachi_kwargs,
jumanpp_kwargs=jumanpp_kwargs,
**kwargs,
)
# ^^ We call the grandparent's init, not the parent's.
@@ -148,6 +156,8 @@ class BertJapaneseTokenizer(BertTokenizer):
self.lower_case = do_lower_case
self.never_split = never_split
self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
self.sudachi_kwargs = copy.deepcopy(sudachi_kwargs)
self.jumanpp_kwargs = copy.deepcopy(jumanpp_kwargs)
if do_word_tokenize:
if word_tokenizer_type == "basic":
self.word_tokenizer = BasicTokenizer(
@@ -157,6 +167,14 @@ class BertJapaneseTokenizer(BertTokenizer):
self.word_tokenizer = MecabTokenizer(
do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {})
)
elif word_tokenizer_type == "sudachi":
self.word_tokenizer = SudachiTokenizer(
do_lower_case=do_lower_case, never_split=never_split, **(sudachi_kwargs or {})
)
elif word_tokenizer_type == "jumanpp":
self.word_tokenizer = JumanppTokenizer(
do_lower_case=do_lower_case, never_split=never_split, **(jumanpp_kwargs or {})
)
else:
raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.")
@@ -176,7 +194,7 @@ class BertJapaneseTokenizer(BertTokenizer):
def __getstate__(self):
state = dict(self.__dict__)
- if self.word_tokenizer_type == "mecab":
+ if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]:
del state["word_tokenizer"]
return state
@@ -186,6 +204,14 @@ class BertJapaneseTokenizer(BertTokenizer):
self.word_tokenizer = MecabTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {})
)
elif self.word_tokenizer_type == "sudachi":
self.word_tokenizer = SudachiTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {})
)
elif self.word_tokenizer_type == "jumanpp":
self.word_tokenizer = JumanppTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {})
)
def _tokenize(self, text):
if self.do_word_tokenize:
@@ -309,6 +335,157 @@ class MecabTokenizer:
return tokens
class SudachiTokenizer:
"""Runs basic tokenization with Sudachi morphological parser."""
def __init__(
self,
do_lower_case=False,
never_split=None,
normalize_text=True,
trim_whitespace=False,
sudachi_split_mode="A",
sudachi_config_path=None,
sudachi_resource_dir=None,
sudachi_dict_type="core",
):
"""
Constructs a SudachiTokenizer.
Args:
**do_lower_case**: (*optional*) boolean (default False)
Whether to lowercase the input.
**never_split**: (*optional*) list of str
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
**normalize_text**: (*optional*) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
**trim_whitespace**: (*optional*) boolean (default False)
Whether to trim all whitespace, tab, newline from tokens.
**sudachi_split_mode**: (*optional*) string
Split mode of sudachi, choose from "A", "B", "C".
**sudachi_config_path**: (*optional*) string
**sudachi_resource_dir**: (*optional*) string
**sudachi_dict_type**: (*optional*) string
dict type of sudachi, choose from "small", "core", "full".
"""
self.do_lower_case = do_lower_case
self.never_split = never_split if never_split is not None else []
self.normalize_text = normalize_text
self.trim_whitespace = trim_whitespace
try:
from sudachipy import dictionary, tokenizer
except ImportError:
raise ImportError(
"You need to install sudachipy to use SudachiTokenizer. "
"See https://github.com/WorksApplications/SudachiPy for installation."
)
if sudachi_split_mode == "A":
self.split_mode = tokenizer.Tokenizer.SplitMode.A
elif sudachi_split_mode == "B":
self.split_mode = tokenizer.Tokenizer.SplitMode.B
elif sudachi_split_mode == "C":
self.split_mode = tokenizer.Tokenizer.SplitMode.C
else:
raise ValueError("Invalid sudachi_split_mode is specified.")
self.sudachi = dictionary.Dictionary(
config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict_type=sudachi_dict_type
).create(self.split_mode)
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
if self.normalize_text:
text = unicodedata.normalize("NFKC", text)
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
for word in self.sudachi.tokenize(text):
token = word.surface()
if self.do_lower_case and token not in never_split:
token = token.lower()
if self.trim_whitespace:
if token.strip() == "":
continue
else:
token = token.strip()
tokens.append(token)
return tokens
class JumanppTokenizer:
"""Runs basic tokenization with jumanpp morphological parser."""
def __init__(
self,
do_lower_case=False,
never_split=None,
normalize_text=True,
trim_whitespace=False,
):
"""
Constructs a JumanppTokenizer.
Args:
**do_lower_case**: (*optional*) boolean (default False)
Whether to lowercase the input.
**never_split**: (*optional*) list of str
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
**normalize_text**: (*optional*) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
**trim_whitespace**: (*optional*) boolean (default False)
Whether to trim all whitespace, tab, newline from tokens.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split if never_split is not None else []
self.normalize_text = normalize_text
self.trim_whitespace = trim_whitespace
try:
import pyknp
except ImportError:
raise ImportError(
"You need to install pyknp to use JumanppTokenizer. "
"See https://github.com/ku-nlp/pyknp for installation."
)
self.juman = pyknp.Juman(jumanpp=True)
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
if self.normalize_text:
text = unicodedata.normalize("NFKC", text)
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
for mrph in self.juman.analysis(text).mrph_list():
token = mrph.midasi
if self.do_lower_case and token not in never_split:
token = token.lower()
if self.trim_whitespace:
if token.strip() == "":
continue
else:
token = token.strip()
tokens.append(token)
return tokens
class CharacterTokenizer:
"""Runs Character tokenization."""

src/transformers/testing_utils.py

@@ -52,6 +52,7 @@ from .utils import (
is_flax_available,
is_ftfy_available,
is_ipex_available,
is_jumanpp_available,
is_librosa_available,
is_onnx_available,
is_pandas_available,
@@ -66,6 +67,7 @@ from .utils import (
is_sentencepiece_available,
is_soundfile_availble,
is_spacy_available,
is_sudachi_available,
is_tensorflow_probability_available,
is_tensorflow_text_available,
is_tf2onnx_available,
@@ -671,6 +673,20 @@ def require_usr_bin_time(test_case):
return unittest.skipUnless(cmd_exists("/usr/bin/time"), "test requires /usr/bin/time")(test_case)
def require_sudachi(test_case):
"""
Decorator marking a test that requires sudachi
"""
return unittest.skipUnless(is_sudachi_available(), "test requires sudachi")(test_case)
def require_jumanpp(test_case):
"""
Decorator marking a test that requires jumanpp
"""
return unittest.skipUnless(is_jumanpp_available(), "test requires jumanpp")(test_case)
def get_gpu_count():
"""
Return the number of available gpus (regardless of whether torch, tf or jax is used)
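
A sketch of how the new decorators are meant to be applied in a test module (class and method names are illustrative):

import unittest

from transformers.testing_utils import require_jumanpp, require_sudachi

class JapaneseTokenizerIntegrationTest(unittest.TestCase):  # illustrative name
    @require_sudachi
    def test_something_with_sudachi(self):
        ...

    @require_jumanpp
    def test_something_with_jumanpp(self):
        ...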

src/transformers/utils/__init__.py

@@ -98,6 +98,7 @@ from .import_utils import (
is_ftfy_available,
is_in_notebook,
is_ipex_available,
is_jumanpp_available,
is_librosa_available,
is_ninja_available,
is_onnx_available,
@@ -121,6 +122,7 @@ from .import_utils import (
is_soundfile_availble,
is_spacy_available,
is_speech_available,
is_sudachi_available,
is_tensorflow_probability_available,
is_tensorflow_text_available,
is_tf2onnx_available,

src/transformers/utils/import_utils.py

@@ -18,6 +18,7 @@ Import utilities: Utilities related to imports and our lazy inits.
import importlib.util
import json
import os
import shutil
import sys
import warnings
from collections import OrderedDict
@@ -671,6 +672,14 @@ def is_ccl_available():
return _is_ccl_available
def is_sudachi_available():
return importlib.util.find_spec("sudachipy") is not None
def is_jumanpp_available():
return (importlib.util.find_spec("pyknp") is not None) and (shutil.which("jumanpp") is not None)
# docstyle-ignore
DATASETS_IMPORT_ERROR = """
{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
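
Note that is_sudachi_available() only checks for the sudachipy package (the dictionary comes in via the "ja" extra), while is_jumanpp_available() also requires the external jumanpp binary. A sketch of guarding on them at a call site:

from transformers.utils import is_jumanpp_available, is_sudachi_available

if not is_sudachi_available():
    raise ImportError("Install sudachipy (and sudachidict_core) to use word_tokenizer_type='sudachi'.")
if not is_jumanpp_available():
    raise ImportError("Install pyknp and the jumanpp binary to use word_tokenizer_type='jumanpp'.")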

tests/models/bert_japanese/test_tokenization_bert_japanese.py

@@ -24,10 +24,12 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import (
BertJapaneseTokenizer,
BertTokenizer,
CharacterTokenizer,
JumanppTokenizer,
MecabTokenizer,
SudachiTokenizer,
WordpieceTokenizer,
)
- from transformers.testing_utils import custom_tokenizers
+ from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi
from ...test_tokenization_common import TokenizerTesterMixin
@@ -172,6 +174,150 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
["アップルストア", "", "iPhone", "", "", "発売", "", "", "", " ", ""], ["アップルストア", "", "iPhone", "", "", "発売", "", "", "", " ", ""],
) )
@require_sudachi
def test_pickle_sudachi_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi")
self.assertIsNotNone(tokenizer)
text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)
with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)
tokens_loaded = tokenizer_new.tokenize(text)
self.assertListEqual(tokens, tokens_loaded)
@require_sudachi
def test_sudachi_tokenizer_core(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
[" ", "\t", "アップル", "ストア", "", "iPhone", "8", " ", "", " ", " ", "\n ", "発売", "", "", "", " ", "", " ", " "],
# fmt: on
)
@require_sudachi
def test_sudachi_tokenizer_split_mode_A(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="A")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "人", "参政", "権"])
@require_sudachi
def test_sudachi_tokenizer_split_mode_B(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="B")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人", "参政権"])
@require_sudachi
def test_sudachi_tokenizer_split_mode_C(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="C")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人参政権"])
@require_sudachi
def test_sudachi_tokenizer_lower(self):
tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
[" ", "\t", "アップル", "ストア", "", "iphone", "8", " ", "", " ", " ", "\n ", "発売", "", "", "", " ", "", " ", " "],
# fmt: on
)
@require_sudachi
def test_sudachi_tokenizer_no_normalize(self):
tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
[" ", "\t", "アップル", "ストア", "", "iPhone", "", " ", "", " ", " ", "\n ", "発売", "", "", "", "\u3000", "", " ", " "],
# fmt: on
)
@require_sudachi
def test_sudachi_tokenizer_trim_whitespace(self):
tokenizer = SudachiTokenizer(trim_whitespace=True, sudachi_dict_type="core")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
@require_jumanpp
def test_pickle_jumanpp_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="jumanpp")
self.assertIsNotNone(tokenizer)
text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)
with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)
tokens_loaded = tokenizer_new.tokenize(text)
self.assertListEqual(tokens, tokens_loaded)
@require_jumanpp
def test_jumanpp_tokenizer(self):
tokenizer = JumanppTokenizer()
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
["アップル", "ストア", "", "iPhone", "8", "\u3000", "", "\u3000", "\u3000", "\u3000", "発売", "", "れた", "\u3000", ""],
# fmt: on
)
@require_jumanpp
def test_jumanpp_tokenizer_lower(self):
tokenizer = JumanppTokenizer(do_lower_case=True)
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
["アップル", "ストア", "", "iphone", "8", "\u3000", "", "\u3000", "\u3000", "\u3000", "発売", "", "れた", "\u3000", ""],
# fmt: on
)
@require_jumanpp
def test_jumanpp_tokenizer_no_normalize(self):
tokenizer = JumanppTokenizer(normalize_text=False)
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
["", "", "", "", "", "ストア", "", "iPhone", "", "\u3000", "", "\u3000", "\u3000", "\u3000", "発売", "", "れた", "\u3000", ""],
# fmt: on
)
@require_jumanpp
def test_jumanpp_tokenizer_trim_whitespace(self):
tokenizer = JumanppTokenizer(trim_whitespace=True)
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "れた", ""],
)
def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"]