Add sudachi and jumanpp tokenizers for bert_japanese (#19043)

* add sudachipy and jumanpp tokenizers for bert_japanese

* use ImportError instead of ModuleNotFoundError in SudachiTokenizer and JumanppTokenizer

* put test cases of test_tokenization_bert_japanese in one line

* add require_sudachi and require_jumanpp decorator for testing

* add sudachi and pyknp(jumanpp) to dependencies

* remove sudachi_dict_small and sudachi_dict_full from dependencies

* empty commit for ci
r-terada 2022-10-06 00:41:37 +09:00 committed by GitHub
parent 60db81ff60
commit 2f53ab5745
8 changed files with 373 additions and 7 deletions


@@ -409,6 +409,16 @@ jobs:
keys:
- v0.5-custom_tokenizers-{{ checksum "setup.py" }}
- v0.5-custom_tokenizers-
- run: sudo apt-get -y update && sudo apt-get install -y cmake
- run:
name: install jumanpp
command: |
wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
tar xvf jumanpp-2.0.0-rc3.tar.xz
mkdir jumanpp-2.0.0-rc3/bld
cd jumanpp-2.0.0-rc3/bld
sudo cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local
sudo make install
- run: pip install --upgrade pip
- run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]
- run: python -m unidic download
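As a quick sanity check of the environment these steps build — a minimal sketch using only the calls the new JumanppTokenizer relies on (shutil.which for the compiled jumanpp binary, pyknp.Juman for the wrapper installed via the ja extra):

import shutil
from pyknp import Juman

# The jumanpp binary built above must be on PATH for pyknp (and JumanppTokenizer) to work.
assert shutil.which("jumanpp") is not None, "jumanpp binary not found on PATH"

# Same call pattern the new JumanppTokenizer uses internally.
juman = Juman(jumanpp=True)
print([m.midasi for m in juman.analysis("こんにちは").mrph_list()])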


@@ -170,6 +170,9 @@ _deps = [
"unidic_lite>=1.0.7",
"uvicorn",
"beautifulsoup4",
"sudachipy>=0.6.6",
"sudachidict_core>=20220729",
"pyknp>=0.6.1",
]
@@ -239,7 +242,7 @@ class DepsTableUpdateCommand(Command):
extras = {}
extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic")
extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "pyknp")
extras["sklearn"] = deps_list("scikit-learn")
extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text")


@@ -76,4 +76,7 @@ deps = {
"unidic_lite": "unidic_lite>=1.0.7",
"uvicorn": "uvicorn",
"beautifulsoup4": "beautifulsoup4",
"sudachipy": "sudachipy>=0.6.6",
"sudachidict_core": "sudachidict_core>=20220729",
"pyknp": "pyknp>=0.6.1",
}
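The three new entries use the same string as both the requirement name and the importable module name, so a minimal sketch for checking they are present in an environment (presence only, versions are not compared):

import importlib.util
from transformers.dependency_versions_table import deps

for name in ("sudachipy", "sudachidict_core", "pyknp"):
    installed = importlib.util.find_spec(name) is not None
    print(f"{deps[name]}: {'installed' if installed else 'missing'}")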


@@ -77,7 +77,7 @@ PRETRAINED_INIT_CONFIGURATION = {
class BertJapaneseTokenizer(BertTokenizer):
r"""
Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer.
Construct a BERT tokenizer for Japanese text.
Args:
vocab_file (`str`):
@@ -89,11 +89,15 @@ class BertJapaneseTokenizer(BertTokenizer):
do_subword_tokenize (`bool`, *optional*, defaults to `True`):
Whether to do subword tokenization.
word_tokenizer_type (`str`, *optional*, defaults to `"basic"`):
Type of word tokenizer.
Type of word tokenizer. Choose from ["basic", "mecab", "sudachi", "jumanpp"].
subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`):
Type of subword tokenizer.
mecab_kwargs (`str`, *optional*):
Type of subword tokenizer. Choose from ["wordpiece", "character"].
mecab_kwargs (`dict`, *optional*):
Dictionary passed to the `MecabTokenizer` constructor.
sudachi_kwargs (`dict`, *optional*):
Dictionary passed to the `SudachiTokenizer` constructor.
jumanpp_kwargs (`dict`, *optional*):
Dictionary passed to the `JumanppTokenizer` constructor.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -116,6 +120,8 @@ class BertJapaneseTokenizer(BertTokenizer):
cls_token="[CLS]",
mask_token="[MASK]",
mecab_kwargs=None,
sudachi_kwargs=None,
jumanpp_kwargs=None,
**kwargs
):
super(BertTokenizer, self).__init__(
@@ -131,6 +137,8 @@ class BertJapaneseTokenizer(BertTokenizer):
subword_tokenizer_type=subword_tokenizer_type,
never_split=never_split,
mecab_kwargs=mecab_kwargs,
sudachi_kwargs=sudachi_kwargs,
jumanpp_kwargs=jumanpp_kwargs,
**kwargs,
)
# ^^ We call the grandparent's init, not the parent's.
@@ -148,6 +156,8 @@ class BertJapaneseTokenizer(BertTokenizer):
self.lower_case = do_lower_case
self.never_split = never_split
self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
self.sudachi_kwargs = copy.deepcopy(sudachi_kwargs)
self.jumanpp_kwargs = copy.deepcopy(jumanpp_kwargs)
if do_word_tokenize:
if word_tokenizer_type == "basic":
self.word_tokenizer = BasicTokenizer(
@@ -157,6 +167,14 @@ class BertJapaneseTokenizer(BertTokenizer):
self.word_tokenizer = MecabTokenizer(
do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {})
)
elif word_tokenizer_type == "sudachi":
self.word_tokenizer = SudachiTokenizer(
do_lower_case=do_lower_case, never_split=never_split, **(sudachi_kwargs or {})
)
elif word_tokenizer_type == "jumanpp":
self.word_tokenizer = JumanppTokenizer(
do_lower_case=do_lower_case, never_split=never_split, **(jumanpp_kwargs or {})
)
else:
raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.")
@@ -176,7 +194,7 @@ class BertJapaneseTokenizer(BertTokenizer):
def __getstate__(self):
state = dict(self.__dict__)
if self.word_tokenizer_type == "mecab":
if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]:
del state["word_tokenizer"]
return state
@@ -186,6 +204,14 @@ class BertJapaneseTokenizer(BertTokenizer):
self.word_tokenizer = MecabTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {})
)
elif self.word_tokenizer_type == "sudachi":
self.word_tokenizer = SudachiTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {})
)
elif self.word_tokenizer_type == "jumanpp":
self.word_tokenizer = JumanppTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {})
)
def _tokenize(self, text):
if self.do_word_tokenize:
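Because __getstate__ drops the non-picklable Sudachi/Juman++ objects and __setstate__ rebuilds the word tokenizer from the stored kwargs, a pickle round trip keeps working. A minimal sketch, reusing the example checkpoint from above:

import pickle
from transformers import BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese", word_tokenizer_type="sudachi"
)
restored = pickle.loads(pickle.dumps(tokenizer))
assert restored.tokenize("こんにちは、世界。") == tokenizer.tokenize("こんにちは、世界。")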
@@ -309,6 +335,157 @@ class MecabTokenizer:
return tokens
class SudachiTokenizer:
"""Runs basic tokenization with Sudachi morphological parser."""
def __init__(
self,
do_lower_case=False,
never_split=None,
normalize_text=True,
trim_whitespace=False,
sudachi_split_mode="A",
sudachi_config_path=None,
sudachi_resource_dir=None,
sudachi_dict_type="core",
):
"""
Constructs a SudachiTokenizer.
Args:
**do_lower_case**: (*optional*) boolean (default False)
Whether to lowercase the input.
**never_split**: (*optional*) list of str
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
**normalize_text**: (*optional*) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
**trim_whitespace**: (*optional*) boolean (default False)
Whether to trim whitespace, tabs, and newlines from tokens.
**sudachi_split_mode**: (*optional*) string
Split mode of Sudachi, choose from "A", "B", "C".
**sudachi_config_path**: (*optional*) string
Path to a Sudachi configuration file, passed to the Sudachi `Dictionary` constructor as `config_path`.
**sudachi_resource_dir**: (*optional*) string
Path to a directory of Sudachi resources, passed to the Sudachi `Dictionary` constructor as `resource_dir`.
**sudachi_dict_type**: (*optional*) string
Dict type of Sudachi, choose from "small", "core", "full".
"""
self.do_lower_case = do_lower_case
self.never_split = never_split if never_split is not None else []
self.normalize_text = normalize_text
self.trim_whitespace = trim_whitespace
try:
from sudachipy import dictionary, tokenizer
except ImportError:
raise ImportError(
"You need to install sudachipy to use SudachiTokenizer. "
"See https://github.com/WorksApplications/SudachiPy for installation."
)
if sudachi_split_mode == "A":
self.split_mode = tokenizer.Tokenizer.SplitMode.A
elif sudachi_split_mode == "B":
self.split_mode = tokenizer.Tokenizer.SplitMode.B
elif sudachi_split_mode == "C":
self.split_mode = tokenizer.Tokenizer.SplitMode.C
else:
raise ValueError("Invalid sudachi_split_mode is specified.")
self.sudachi = dictionary.Dictionary(
config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict_type=sudachi_dict_type
).create(self.split_mode)
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
if self.normalize_text:
text = unicodedata.normalize("NFKC", text)
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
for word in self.sudachi.tokenize(text):
token = word.surface()
if self.do_lower_case and token not in never_split:
token = token.lower()
if self.trim_whitespace:
if token.strip() == "":
continue
else:
token = token.strip()
tokens.append(token)
return tokens
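A short sketch of how the split modes differ in practice, using the standalone class directly (expected outputs taken from the tests added further below):

from transformers.models.bert_japanese.tokenization_bert_japanese import SudachiTokenizer

for mode in ("A", "B", "C"):
    tok = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode=mode)
    print(mode, tok.tokenize("外国人参政権"))
# A -> ['外国', '人', '参政', '権'], B -> ['外国人', '参政権'], C -> ['外国人参政権']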
class JumanppTokenizer:
"""Runs basic tokenization with jumanpp morphological parser."""
def __init__(
self,
do_lower_case=False,
never_split=None,
normalize_text=True,
trim_whitespace=False,
):
"""
Constructs a JumanppTokenizer.
Args:
**do_lower_case**: (*optional*) boolean (default False)
Whether to lowercase the input.
**never_split**: (*optional*) list of str
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
**normalize_text**: (*optional*) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
**trim_whitespace**: (*optional*) boolean (default False)
Whether to trim whitespace, tabs, and newlines from tokens.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split if never_split is not None else []
self.normalize_text = normalize_text
self.trim_whitespace = trim_whitespace
try:
import pyknp
except ImportError:
raise ImportError(
"You need to install pyknp to use JumanppTokenizer. "
"See https://github.com/ku-nlp/pyknp for installation."
)
self.juman = pyknp.Juman(jumanpp=True)
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
if self.normalize_text:
text = unicodedata.normalize("NFKC", text)
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
for mrph in self.juman.analysis(text).mrph_list():
token = mrph.midasi
if self.do_lower_case and token not in never_split:
token = token.lower()
if self.trim_whitespace:
if token.strip() == "":
continue
else:
token = token.strip()
tokens.append(token)
return tokens
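And the Juman++ counterpart, as a sketch (needs pyknp plus a jumanpp binary on PATH; with trim_whitespace=True the \u3000 tokens seen in the tests below are dropped):

from transformers.models.bert_japanese.tokenization_bert_japanese import JumanppTokenizer

tok = JumanppTokenizer(trim_whitespace=True)
print(tok.tokenize("アップルストアでiPhone8が発売された。"))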
class CharacterTokenizer:
"""Runs Character tokenization."""


@@ -52,6 +52,7 @@ from .utils import (
is_flax_available,
is_ftfy_available,
is_ipex_available,
is_jumanpp_available,
is_librosa_available,
is_onnx_available,
is_pandas_available,
@@ -66,6 +67,7 @@ from .utils import (
is_sentencepiece_available,
is_soundfile_availble,
is_spacy_available,
is_sudachi_available,
is_tensorflow_probability_available,
is_tensorflow_text_available,
is_tf2onnx_available,
@@ -671,6 +673,20 @@ def require_usr_bin_time(test_case):
return unittest.skipUnless(cmd_exists("/usr/bin/time"), "test requires /usr/bin/time")(test_case)
def require_sudachi(test_case):
"""
Decorator marking a test that requires sudachi
"""
return unittest.skipUnless(is_sudachi_available(), "test requires sudachi")(test_case)
def require_jumanpp(test_case):
"""
Decorator marking a test that requires jumanpp
"""
return unittest.skipUnless(is_jumanpp_available(), "test requires jumanpp")(test_case)
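The decorators are meant to be used like the existing require_* helpers; a hypothetical test class as a sketch:

import unittest
from transformers.testing_utils import require_jumanpp, require_sudachi

class JapaneseTokenizerSmokeTest(unittest.TestCase):
    @require_sudachi
    def test_with_sudachi(self):
        ...  # skipped automatically when sudachipy is not installed

    @require_jumanpp
    def test_with_jumanpp(self):
        ...  # skipped automatically when pyknp or the jumanpp binary is missing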
def get_gpu_count():
"""
Return the number of available gpus (regardless of whether torch, tf or jax is used)


@@ -98,6 +98,7 @@ from .import_utils import (
is_ftfy_available,
is_in_notebook,
is_ipex_available,
is_jumanpp_available,
is_librosa_available,
is_ninja_available,
is_onnx_available,
@@ -121,6 +122,7 @@ from .import_utils import (
is_soundfile_availble,
is_spacy_available,
is_speech_available,
is_sudachi_available,
is_tensorflow_probability_available,
is_tensorflow_text_available,
is_tf2onnx_available,


@@ -18,6 +18,7 @@ Import utilities: Utilities related to imports and our lazy inits.
import importlib.util
import json
import os
import shutil
import sys
import warnings
from collections import OrderedDict
@@ -671,6 +672,14 @@ def is_ccl_available():
return _is_ccl_available
def is_sudachi_available():
return importlib.util.find_spec("sudachipy") is not None
def is_jumanpp_available():
return (importlib.util.find_spec("pyknp") is not None) and (shutil.which("jumanpp") is not None)
# docstyle-ignore
DATASETS_IMPORT_ERROR = """
{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with:


@@ -24,10 +24,12 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import (
BertJapaneseTokenizer,
BertTokenizer,
CharacterTokenizer,
JumanppTokenizer,
MecabTokenizer,
SudachiTokenizer,
WordpieceTokenizer,
)
from transformers.testing_utils import custom_tokenizers
from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi
from ...test_tokenization_common import TokenizerTesterMixin
@@ -172,6 +174,150 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
["アップルストア", "", "iPhone", "", "", "発売", "", "", "", " ", ""],
)
@require_sudachi
def test_pickle_sudachi_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi")
self.assertIsNotNone(tokenizer)
text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "", "世界", "", "こん", "##ばんは", "", "世界", ""])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)
with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)
tokens_loaded = tokenizer_new.tokenize(text)
self.assertListEqual(tokens, tokens_loaded)
@require_sudachi
def test_sudachi_tokenizer_core(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
[" ", "\t", "アップル", "ストア", "", "iPhone", "8", " ", "", " ", " ", "\n ", "発売", "", "", "", " ", "", " ", " "],
# fmt: on
)
@require_sudachi
def test_sudachi_tokenizer_split_mode_A(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="A")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "", "参政", ""])
@require_sudachi
def test_sudachi_tokenizer_split_mode_B(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="B")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人", "参政権"])
@require_sudachi
def test_sudachi_tokenizer_split_mode_C(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="C")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人参政権"])
@require_sudachi
def test_sudachi_tokenizer_lower(self):
tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
[" ", "\t", "アップル", "ストア", "", "iphone", "8", " ", "", " ", " ", "\n ", "発売", "", "", "", " ", "", " ", " "],
# fmt: on
)
@require_sudachi
def test_sudachi_tokenizer_no_normalize(self):
tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
[" ", "\t", "アップル", "ストア", "", "iPhone", "", " ", "", " ", " ", "\n ", "発売", "", "", "", "\u3000", "", " ", " "],
# fmt: on
)
@require_sudachi
def test_sudachi_tokenizer_trim_whitespace(self):
tokenizer = SudachiTokenizer(trim_whitespace=True, sudachi_dict_type="core")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
@require_jumanpp
def test_pickle_jumanpp_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="jumanpp")
self.assertIsNotNone(tokenizer)
text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)
with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)
tokens_loaded = tokenizer_new.tokenize(text)
self.assertListEqual(tokens, tokens_loaded)
@require_jumanpp
def test_jumanpp_tokenizer(self):
tokenizer = JumanppTokenizer()
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
["アップル", "ストア", "", "iPhone", "8", "\u3000", "", "\u3000", "\u3000", "\u3000", "発売", "", "れた", "\u3000", ""],
# fmt: on
)
@require_jumanpp
def test_jumanpp_tokenizer_lower(self):
tokenizer = JumanppTokenizer(do_lower_case=True)
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
["アップル", "ストア", "", "iphone", "8", "\u3000", "", "\u3000", "\u3000", "\u3000", "発売", "", "れた", "\u3000", ""],
# fmt: on
)
@require_jumanpp
def test_jumanpp_tokenizer_no_normalize(self):
tokenizer = JumanppTokenizer(normalize_text=False)
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
# fmt: off
["", "", "", "", "", "ストア", "", "iPhone", "", "\u3000", "", "\u3000", "\u3000", "\u3000", "発売", "", "れた", "\u3000", ""],
# fmt: on
)
@require_jumanpp
def test_jumanpp_tokenizer_trim_whitespace(self):
tokenizer = JumanppTokenizer(trim_whitespace=True)
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "れた", ""],
)
def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"]