Support additional dictionaries for BERT Japanese tokenizers (#6515)
* Update BERT Japanese tokenizers
* Update CircleCI config to download unidic
* Specify to use the latest dictionary packages
parent 423eb5b1d7
commit 48c6c6139f

@@ -150,6 +150,7 @@ jobs:
             - v0.3-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
       - run: pip install .[ja,testing]
+      - run: python -m unidic download
       - save_cache:
           key: v0.3-custom_tokenizers-{{ checksum "setup.py" }}
           paths:
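
The added `python -m unidic download` step is needed because the unidic package installs only a small stub; the dictionary data itself is fetched separately. A minimal sketch of the check this enables (the assertion and its message are illustrative, not part of the change):

    # Verify the unidic data is present after running `python -m unidic download`.
    # unidic.DICDIR points at the downloaded dictionary directory; MecabTokenizer
    # performs a similar check before handing the directory to MeCab.
    import os

    import unidic

    assert os.path.isdir(unidic.DICDIR), "unidic data missing; run `python -m unidic download`"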

setup.py (2 lines changed)

@@ -65,7 +65,7 @@ if stale_egg_info.exists():

 extras = {}

-extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0,<2.0"]
+extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0.0,<2.0", "unidic_lite>=1.0.7", "unidic>=1.0.2"]
 extras["sklearn"] = ["scikit-learn"]

 # keras2onnx and onnxconverter-common version is specific through a commit until 1.7.0 lands on pypi
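
For orientation, the `ja` extra now installs the MeCab bindings plus three dictionary packages. A rough sketch of what each provides (attribute names are as published by the respective packages; the snippet is illustrative and assumes they are all installed):

    import fugashi      # MeCab bindings; provides fugashi.GenericTagger used by MecabTokenizer
    import ipadic       # default dictionary; ipadic.DICDIR points at its bundled data
    import unidic_lite  # self-contained small UniDic; unidic_lite.DICDIR works out of the box
    import unidic       # full UniDic; its DICDIR is only usable after `python -m unidic download`

    print(ipadic.DICDIR, unidic_lite.DICDIR, unidic.DICDIR)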

@@ -167,30 +167,89 @@ class BertJapaneseTokenizer(BertTokenizer):
 class MecabTokenizer:
     """Runs basic tokenization with MeCab morphological parser."""

-    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None):
+    def __init__(
+        self,
+        do_lower_case=False,
+        never_split=None,
+        normalize_text=True,
+        mecab_dic: Optional[str] = "ipadic",
+        mecab_option: Optional[str] = None,
+    ):
         """Constructs a MecabTokenizer.

         Args:
             **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input.
+                Whether to lowercase the input.
             **never_split**: (`optional`) list of str
                 Kept for backward compatibility purposes.
                 Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
-                List of token not to split.
+                List of tokens not to split.
             **normalize_text**: (`optional`) boolean (default True)
                 Whether to apply unicode normalization to text before tokenization.
-            **mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "")
+            **mecab_dic**: (`optional`) string (default "ipadic")
+                Name of dictionary to be used for MeCab initialization.
+                If you are using a system-installed dictionary, set this option to `None` and modify `mecab_option`.
+            **mecab_option**: (`optional`) string
+                String passed to MeCab constructor.
         """
         self.do_lower_case = do_lower_case
         self.never_split = never_split if never_split is not None else []
         self.normalize_text = normalize_text

-        import fugashi
-        import ipadic
+        try:
+            import fugashi
+        except ModuleNotFoundError as error:
+            raise error.__class__(
+                "You need to install fugashi to use MecabTokenizer. "
+                "See https://pypi.org/project/fugashi/ for installation."
+            )

-        # Use ipadic by default (later options can override it)
         mecab_option = mecab_option or ""
-        mecab_option = ipadic.MECAB_ARGS + " " + mecab_option
+
+        if mecab_dic is not None:
+            if mecab_dic == "ipadic":
+                try:
+                    import ipadic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The ipadic dictionary is not installed. "
+                        "See https://github.com/polm/ipadic-py for installation."
+                    )
+
+                dic_dir = ipadic.DICDIR
+
+            elif mecab_dic == "unidic_lite":
+                try:
+                    import unidic_lite
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic_lite dictionary is not installed. "
+                        "See https://github.com/polm/unidic-lite for installation."
+                    )
+
+                dic_dir = unidic_lite.DICDIR
+
+            elif mecab_dic == "unidic":
+                try:
+                    import unidic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic dictionary is not installed. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+
+                dic_dir = unidic.DICDIR
+                if not os.path.isdir(dic_dir):
+                    raise RuntimeError(
+                        "The unidic dictionary itself is not found. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+
+            else:
+                raise ValueError("Invalid mecab_dic is specified.")
+
+            mecabrc = os.path.join(dic_dir, "mecabrc")
+            mecab_option = "-d {} -r {} ".format(dic_dir, mecabrc) + mecab_option

         self.mecab = fugashi.GenericTagger(mecab_option)

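
To make the new argument concrete, a hedged usage sketch follows (the module path matches the file being changed; the system dictionary path in the last line is a placeholder, and printed tokens depend on the installed dictionaries):

    # Illustrative use of the new mecab_dic argument; not part of the diff.
    from transformers.tokenization_bert_japanese import MecabTokenizer

    # Bundled dictionaries: "ipadic" (default), "unidic_lite", "unidic".
    ipadic_tokenizer = MecabTokenizer(mecab_dic="ipadic")
    print(ipadic_tokenizer.tokenize("アップルストアでiPhone8が発売された。"))

    unidic_lite_tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
    print(unidic_lite_tokenizer.tokenize("アップルストアでiPhone8が発売された。"))

    # For a system-installed dictionary, bypass the bundled ones and pass MeCab options directly.
    custom_tokenizer = MecabTokenizer(mecab_dic=None, mecab_option="-d /path/to/your/mecab/dic")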

@@ -213,7 +272,7 @@ class MecabTokenizer:
         return tokens


-class CharacterTokenizer(object):
+class CharacterTokenizer:
     """Runs Character tokenization."""

     def __init__(self, vocab, unk_token, normalize_text=True):

@@ -247,7 +306,7 @@ class CharacterTokenizer(object):
         text = unicodedata.normalize("NFKC", text)

         output_tokens = []
-        for i, char in enumerate(text):
+        for char in text:
             if char not in self.vocab:
                 output_tokens.append(self.unk_token)
                 continue
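
Since the loop above defines how unknown characters are handled, here is a small hedged sketch of CharacterTokenizer in isolation (the vocabulary and the expected output are illustrative):

    # Characters missing from the vocab map to unk_token, per the loop above.
    from transformers.tokenization_bert_japanese import CharacterTokenizer

    vocab = {"[UNK]": 0, "こ": 1, "ん": 2, "に": 3, "ち": 4, "は": 5}
    tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]")
    print(tokenizer.tokenize("こんにちほ"))  # ["こ", "ん", "に", "ち", "[UNK]"]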

@@ -87,16 +87,38 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

-    def test_mecab_tokenizer(self):
-        tokenizer = MecabTokenizer()
+    def test_mecab_tokenizer_ipadic(self):
+        tokenizer = MecabTokenizer(mecab_dic="ipadic")

         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
             ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
         )

+    def test_mecab_tokenizer_unidic_lite(self):
+        try:
+            tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
+        except ModuleNotFoundError:
+            return
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
+    def test_mecab_tokenizer_unidic(self):
+        try:
+            tokenizer = MecabTokenizer(mecab_dic="unidic")
+        except ModuleNotFoundError:
+            return
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
     def test_mecab_tokenizer_lower(self):
-        tokenizer = MecabTokenizer(do_lower_case=True)
+        tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic")

         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),

@@ -118,7 +140,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         )

     def test_mecab_tokenizer_no_normalize(self):
-        tokenizer = MecabTokenizer(normalize_text=False)
+        tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic")

         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
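
The unidic_lite and unidic tests return early when their dictionary package is missing, so the suite stays runnable without every dictionary installed. A hedged sketch of invoking just these tests locally (the RUN_CUSTOM_TOKENIZERS flag and the test file path are assumptions about how the repository gates its custom-tokenizer tests, not something shown in this diff):

    # Illustrative local test run; env var name and path are assumptions.
    import os

    import pytest

    os.environ["RUN_CUSTOM_TOKENIZERS"] = "yes"
    pytest.main(["-sv", "tests/test_tokenization_bert_japanese.py", "-k", "mecab"])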