diff --git a/.circleci/config.yml b/.circleci/config.yml index b9cf5126878..57a0e7d3133 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -549,7 +549,7 @@ jobs: - v0.4-custom_tokenizers-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy] + - run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba] - run: python -m unidic download - save_cache: key: v0.4-custom_tokenizers-{{ checksum "setup.py" }} @@ -785,7 +785,7 @@ jobs: - v0.4-torch-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[torch,testing,sentencepiece,onnxruntime,vision] + - run: pip install .[torch,testing,sentencepiece,onnxruntime,vision,rjieba] - save_cache: key: v0.4-onnx-{{ checksum "setup.py" }} paths: diff --git a/setup.py b/setup.py index 02dd9ff2213..bb3598fda20 100644 --- a/setup.py +++ b/setup.py @@ -140,6 +140,7 @@ _deps = [ "ray[tune]", "regex!=2019.12.17", "requests", + "rjieba", "rouge-score", "sacrebleu>=1.4.12,<2.0.0", "sacremoses", @@ -288,7 +289,8 @@ extras["testing"] = ( "nltk", "GitPython", "hf-doc-builder", - 'sacremoses' + "sacremoses", + "rjieba" ) + extras["retrieval"] + extras["modelcreation"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 2cd16075744..4b3498e1f8e 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -47,6 +47,7 @@ deps = { "ray[tune]": "ray[tune]", "regex": "regex!=2019.12.17", "requests": "requests", + "rjieba": "rjieba", "rouge-score": "rouge-score", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py index 26c37d4580f..59644df7465 100644 --- a/src/transformers/models/roformer/tokenization_roformer_fast.py +++ b/src/transformers/models/roformer/tokenization_roformer_fast.py @@ -27,7 +27,7 @@ from .tokenization_utils import JiebaPreTokenizer logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { diff --git a/tests/models/roformer/test_tokenization_roformer.py b/tests/models/roformer/test_tokenization_roformer.py index db31b34de8c..7546bc2e41d 100644 --- a/tests/models/roformer/test_tokenization_roformer.py +++ b/tests/models/roformer/test_tokenization_roformer.py @@ -71,3 +71,7 @@ class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # can't train new_tokenizer via Tokenizers lib def test_training_new_tokenizer_with_special_tokens_change(self): pass + + # can't serialise custom PreTokenizer + def test_save_slow_from_fast_and_reload_fast(self): + pass diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index ea5a5476393..4ecfc917d56 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -16,7 +16,7 @@ from transformers.onnx import ( validate_model_outputs, ) from transformers.onnx.utils import compute_effective_axis_dimension, compute_serialized_parameters_size -from transformers.testing_utils import require_onnx, require_tf, require_torch, require_vision, slow +from transformers.testing_utils import require_onnx, require_rjieba, require_tf, require_torch, require_vision, slow if is_torch_available() or is_tf_available(): @@ -287,6 +287,7 @@ class OnnxExportTestCaseV2(TestCase): @slow @require_torch @require_vision + @require_rjieba def test_pytorch_export(self, test_name, name, model_name, feature, onnx_config_class_constructor): self._onnx_export(test_name, name, model_name, feature, onnx_config_class_constructor)