transformers/tests/test_tokenization_bert_japanese.py
Thomas Wolf ba8c4d0ac0
[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)
* splitting fast and slow tokenizers [WIP]

* [WIP] splitting sentencepiece and tokenizers dependencies

* update dummy objects

* add name_or_path to models and tokenizers

* prefix added to file names

* prefix

* styling + quality

* spliting all the tokenizer files - sorting sentencepiece based ones

* update tokenizer version up to 0.9.0

* remove hard dependency on sentencepiece 🎉

* and removed hard dependency on tokenizers 🎉

* update conversion script

* update missing models

* fixing tests

* move test_tokenization_fast to main tokenization tests - fix bugs

* bump up tokenizers

* fix bert_generation

* update ad fix several tokenizers

* keep sentencepiece in deps for now

* fix funnel and deberta tests

* fix fsmt

* fix marian tests

* fix layoutlm

* fix squeezebert and gpt2

* fix T5 tokenization

* fix xlnet tests

* style

* fix mbart

* bump up tokenizers to 0.9.2

* fix model tests

* fix tf models

* fix seq2seq examples

* fix tests without sentencepiece

* fix slow => fast  conversion without sentencepiece

* update auto and bert generation tests

* fix mbart tests

* fix auto and common test without tokenizers

* fix tests without tokenizers

* clean up tests lighten up when tokenizers + sentencepiece are both off

* style quality and tests fixing

* add sentencepiece to doc/examples reqs

* leave sentencepiece on for now

* style quality split hebert and fix pegasus

* WIP Herbert fast

* add sample_text_no_unicode and fix hebert tokenization

* skip FSMT example test for now

* fix style

* fix fsmt in example tests

* update following Lysandre and Sylvain's comments

* Update src/transformers/testing_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/testing_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-18 20:51:24 +02:00

270 lines
11 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pickle
import unittest
from transformers.testing_utils import custom_tokenizers
from transformers.tokenization_bert_japanese import (
VOCAB_FILES_NAMES,
BertJapaneseTokenizer,
CharacterTokenizer,
MecabTokenizer,
WordpieceTokenizer,
)
from .test_tokenization_common import TokenizerTesterMixin
@custom_tokenizers
class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertJapaneseTokenizer
space_between_special_tokens = True
def setUp(self):
super().setUp()
vocab_tokens = [
"[UNK]",
"[CLS]",
"[SEP]",
"こんにちは",
"こん",
"にちは",
"ばんは",
"##こん",
"##にちは",
"##ばんは",
"世界",
"##世界",
"",
"##、",
"",
"##。",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):
input_text = "こんにちは、世界。 \nこんばんは、世界。"
output_text = "こんにちは 、 世界 。 こんばんは 、 世界 。"
return input_text, output_text
def get_clean_sequence(self, tokenizer):
input_text, output_text = self.get_input_output_texts(tokenizer)
ids = tokenizer.encode(output_text, add_special_tokens=False)
text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
return text, ids
def test_pretokenized_inputs(self):
pass # TODO add if relevant
def test_maximum_encoding_length_pair_input(self):
pass # TODO add if relevant
def test_maximum_encoding_length_single_input(self):
pass # TODO add if relevant
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file)
tokens = tokenizer.tokenize("こんにちは、世界。\nこんばんは、世界。")
self.assertListEqual(tokens, ["こんにちは", "", "世界", "", "こん", "##ばんは", "", "世界", ""])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
def test_pickle_mecab_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="mecab")
self.assertIsNotNone(tokenizer)
text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "", "世界", "", "こん", "##ばんは", "", "世界", ""])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)
with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)
tokens_loaded = tokenizer_new.tokenize(text)
self.assertListEqual(tokens, tokens_loaded)
def test_mecab_tokenizer_ipadic(self):
tokenizer = MecabTokenizer(mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップルストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_unidic_lite(self):
try:
tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
except ModuleNotFoundError:
return
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_unidic(self):
try:
tokenizer = MecabTokenizer(mecab_dic="unidic")
except ModuleNotFoundError:
return
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_lower(self):
tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップルストア", "", "iphone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_with_option(self):
try:
tokenizer = MecabTokenizer(
do_lower_case=True, normalize_text=False, mecab_option="-d /usr/local/lib/mecab/dic/jumandic"
)
except RuntimeError:
# if dict doesn't exist in the system, previous code raises this error.
return
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップルストア", "", "iPhone", "", "", "発売", "", "れた", "\u3000", ""],
)
def test_mecab_tokenizer_no_normalize(self):
tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップルストア", "", "iPhone", "", "", "発売", "", "", "", " ", ""],
)
def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは" "ばんは", "##こん", "##にちは", "##ばんは"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
self.assertListEqual(tokenizer.tokenize(""), [])
self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こんにちは"])
self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"])
self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"])
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("cl-tohoku/bert-base-japanese")
text = tokenizer.encode("ありがとう。", add_special_tokens=False)
text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
# 2 is for "[CLS]", 3 is for "[SEP]"
assert encoded_sentence == [2] + text + [3]
assert encoded_pair == [2] + text + [3] + text_2 + [3]
@custom_tokenizers
class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertJapaneseTokenizer
def setUp(self):
super().setUp()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "", "", "", "", "", "", "", "", "", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self, **kwargs):
return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "こんにちは、世界。 \nこんばんは、世界。"
output_text = "こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
return input_text, output_text
def test_pretokenized_inputs(self):
pass # TODO add if relevant
def test_maximum_encoding_length_pair_input(self):
pass # TODO add if relevant
def test_maximum_encoding_length_single_input(self):
pass # TODO add if relevant
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character")
tokens = tokenizer.tokenize("こんにちは、世界。 \nこんばんは、世界。")
self.assertListEqual(
tokens, ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""]
)
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens), [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12]
)
def test_character_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "", "", "", "", "", "", "", "" "", ""]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]")
self.assertListEqual(tokenizer.tokenize(""), [])
self.assertListEqual(tokenizer.tokenize("こんにちは"), ["", "", "", "", ""])
self.assertListEqual(tokenizer.tokenize("こんにちほ"), ["", "", "", "", "[UNK]"])
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("cl-tohoku/bert-base-japanese-char")
text = tokenizer.encode("ありがとう。", add_special_tokens=False)
text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
# 2 is for "[CLS]", 3 is for "[SEP]"
assert encoded_sentence == [2] + text + [3]
assert encoded_pair == [2] + text + [3] + text_2 + [3]