transformers/tests/models/cpm/test_tokenization_cpm.py
Pavel Iakubovskii 48461c0fe2
Make pipeline able to load processor (#32514)
* Refactor get_test_pipeline

* Fixup

* Fixing tests

* Add processor loading in tests

* Restructure processors loading

* Add processor to the pipeline

* Move model loading on tom of the test

* Update `get_test_pipeline`

* Fixup

* Add class-based flags for loading processors

* Change `is_pipeline_test_to_skip` signature

* Skip t5 failing test for slow tokenizer

* Fixup

* Fix copies for T5

* Fix typo

* Add try/except for tokenizer loading (kosmos-2 case)

* Fixup

* Llama not fails for long generation

* Revert processor pass in text-generation test

* Fix docs

* Switch back to json file for image processors and feature extractors

* Add processor type check

* Remove except for tokenizers

* Fix docstring

* Fix empty lists for tests

* Fixup

* Fix load check

* Ensure we have non-empty test cases

* Update src/transformers/pipelines/__init__.py

Co-authored-by: Lysandre Debut <hi@lysand.re>

* Update src/transformers/pipelines/base.py

Co-authored-by: Lysandre Debut <hi@lysand.re>

* Rework comment

* Better docs, add note about pipeline components

* Change warning to error raise

* Fixup

* Refine pipeline docs

---------

Co-authored-by: Lysandre Debut <hi@lysand.re>
2024-10-09 16:46:11 +01:00

52 lines
1.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
# Copyright 2018 HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.models.cpm.tokenization_cpm import CpmTokenizer
from transformers.testing_utils import custom_tokenizers
@custom_tokenizers
class CpmTokenizationTest(unittest.TestCase):
# There is no `CpmModel`
def is_pipeline_test_to_skip(
self,
pipeline_test_case_name,
config_class,
model_architecture,
tokenizer_name,
image_processor_name,
feature_extractor_name,
processor_name,
):
return True
def test_pre_tokenization(self):
tokenizer = CpmTokenizer.from_pretrained("TsinghuaAI/CPM-Generate")
text = "Hugging Face大法好谁用谁知道。"
normalized_text = "Hugging Face大法好,谁用谁知道。<unk>"
bpe_tokens = "▁Hu gg ing ▁ ▂ ▁F ace ▁大法 ▁好 ▁ , ▁谁 ▁用 ▁谁 ▁知 道 ▁ 。".split()
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + [tokenizer.unk_token]
input_bpe_tokens = [13789, 13283, 1421, 8, 10, 1164, 13608, 16528, 63, 8, 9, 440, 108, 440, 121, 90, 8, 12, 0]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
reconstructed_text = tokenizer.decode(input_bpe_tokens)
self.assertEqual(reconstructed_text, normalized_text)