mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-13 17:48:22 +06:00

* Adding support for raw python `generator` in addition to `Dataset`

The main goal is to ease the creation of streaming data for the pipe. `Dataset` is more involved and PyTorch-specific. This PR provides a way to use a plain Python iterator too. This enables #14250 but can be proposed as a standalone PR.

```python
from transformers import pipeline

def read_data(filename):
    with open(filename, 'r') as f:
        for line in f:
            yield line

pipe = pipeline("text-classification")
for classified in pipe(read_data("large_file.txt")):
    print("Success ! ", classified)
```

The main caveat is the interaction with `DataLoader` when `num_workers>1`. When you have multiple workers, each receives a copy of the generator (like `IterableDataset`). That means the naive iterator will fail, since all workers iterate over all items of the generator. There are ways to do clever "skipping", but this can still be costly because every worker has to pass through all items of the generator (they just ignore the items they don't handle). Using `num_workers=1` is the simplest fix and, if the cost of loading your data is small enough, should be good enough. In the above example, for instance, trying to do smart tricks to skip some lines is unlikely to be a net positive. If there are better ways to do "jumps" on some data, then using `Dataset` is advised instead (since then different workers can just jump by themselves).

* Adding iterator support for `tf` too.
406 lines
16 KiB
Python
406 lines
16 KiB
Python
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import copy
|
|
import importlib
|
|
import logging
|
|
import random
|
|
import string
|
|
import unittest
|
|
from abc import abstractmethod
|
|
from functools import lru_cache
|
|
from unittest import skipIf
|
|
|
|
from transformers import (
|
|
FEATURE_EXTRACTOR_MAPPING,
|
|
TOKENIZER_MAPPING,
|
|
AutoFeatureExtractor,
|
|
AutoTokenizer,
|
|
IBertConfig,
|
|
RobertaConfig,
|
|
TextClassificationPipeline,
|
|
pipeline,
|
|
)
|
|
from transformers.pipelines import get_task
|
|
from transformers.pipelines.base import _pad
|
|
from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch
|
|
|
|
|
|
# Module-level logger named after this module; the helper functions in this file log through it.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_checkpoint_from_architecture(architecture):
    """Return the documentation checkpoint declared by the module defining `architecture`.

    The module named by `architecture.__module__` is imported and its
    `_CHECKPOINT_FOR_DOC` attribute is returned when it exists.

    Args:
        architecture: A model class (anything exposing `__module__` and `__name__`).

    Returns:
        The value of `_CHECKPOINT_FOR_DOC`, or `None` when the module cannot be
        imported (logged as an error) or does not define that attribute
        (logged as a warning).
    """
    try:
        defining_module = importlib.import_module(architecture.__module__)
    except ImportError:
        logger.error(f"Ignoring architecture {architecture}")
        return None

    if not hasattr(defining_module, "_CHECKPOINT_FOR_DOC"):
        logger.warning(f"Can't retrieve checkpoint from {architecture.__name__}")
        return None

    return defining_module._CHECKPOINT_FOR_DOC
|
|
|
|
|
|
def get_tiny_config_from_class(configuration_class):
    """Build a small configuration for `configuration_class` via its model tester.

    The tester is looked up as `<Name>ModelTester` inside the module
    `tests.test_modeling_<model_type>` (dashes in `model_type` become
    underscores), where `<Name>` is the configuration class name up to
    `"Config"`. The tester is instantiated with `parent=None`, and
    `get_pipeline_config()` is preferred over `get_config()` when both exist.

    Args:
        configuration_class: A configuration class exposing `__name__` and `model_type`.

    Returns:
        The configuration produced by the tester, or `None` when the class is
        an OpenAIGPT configuration, when the tester module or class cannot be
        found, or when the tester exposes neither factory method.
    """
    config_name = configuration_class.__name__
    if "OpenAIGPT" in config_name:
        # This is the only file that is inconsistent with the naming scheme.
        # Will rename this file if we decide this is the way to go
        return None

    model_type = configuration_class.model_type
    camel_case_model_name = config_name.split("Config")[0]

    try:
        tester_module = importlib.import_module(
            f".test_modeling_{model_type.replace('-', '_')}", package="tests"
        )
        model_tester_class = getattr(tester_module, f"{camel_case_model_name}ModelTester", None)
    except (ImportError, AttributeError):
        logger.error(f"No model tester class for {configuration_class.__name__}")
        return None

    if model_tester_class is None:
        logger.warning(f"No model tester class for {configuration_class.__name__}")
        return None

    model_tester = model_tester_class(parent=None)

    # The pipeline-specific factory takes precedence over the generic one.
    for factory_name in ("get_pipeline_config", "get_config"):
        if hasattr(model_tester, factory_name):
            return getattr(model_tester, factory_name)()

    logger.warning(f"Model tester {model_tester_class.__name__} has no `get_config()`.")
    return None
|
|
|
|
|
|
@lru_cache(maxsize=100)
def get_tiny_tokenizer_from_checkpoint(checkpoint):
    """Load the tokenizer of `checkpoint` and shrink its vocabulary when it is large.

    Tokenizers whose `vocab_size` is below 300 are returned untouched. Larger
    ones are retrained with `train_new_from_iterator` on ASCII letters, digits
    and the space character, with a vocabulary size equal to the number of
    those characters.

    Results are memoized per `checkpoint` (LRU cache of 100 entries), so
    repeated calls with the same checkpoint return the same tokenizer object.

    Args:
        checkpoint: Identifier passed to `AutoTokenizer.from_pretrained`.

    Returns:
        The original tokenizer (small vocabulary) or the retrained one.
    """
    pretrained = AutoTokenizer.from_pretrained(checkpoint)

    if pretrained.vocab_size >= 300:
        logger.info("Training new from iterator ...")
        alphabet = string.ascii_letters + string.digits + " "
        retrained = pretrained.train_new_from_iterator(alphabet, vocab_size=len(alphabet), show_progress=False)
        logger.info("Trained.")
        return retrained

    # Wav2Vec2ForCTC for instance
    # ByT5Tokenizer
    # all are already small enough and have no Fast version that can
    # be retrained
    return pretrained
|
|
|
|
|
|
def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config):
    """Load the feature extractor of `checkpoint` and resize it to match `tiny_config`.

    Loading is best effort: any `Exception` raised by
    `AutoFeatureExtractor.from_pretrained` results in `None`. When an extractor
    is available it is re-instantiated from its own class so that its sizes
    follow the tiny configuration:

    - `image_size` on the config -> `size` and `crop_size`
    - `input_feat_per_channel` on the config -> `feature_size` and `num_mel_bins`

    Args:
        checkpoint: Identifier passed to `AutoFeatureExtractor.from_pretrained`.
        tiny_config: Configuration object inspected for the attributes above.

    Returns:
        The (possibly re-instantiated) feature extractor, or `None` if loading failed.
    """
    try:
        extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
    except Exception:
        # Deliberate best effort: a missing or broken extractor is reported as `None`.
        extractor = None

    if extractor and hasattr(tiny_config, "image_size"):
        extractor = extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)

    # Speech2TextModel specific.
    if extractor and hasattr(tiny_config, "input_feat_per_channel"):
        extractor = extractor.__class__(
            feature_size=tiny_config.input_feat_per_channel, num_mel_bins=tiny_config.input_feat_per_channel
        )

    return extractor
|
|
|
|
|
|
class ANY:
    """Equality wildcard matching any instance of the given type(s).

    Useful in assertions on nested outputs where only the type of a value is
    known in advance, e.g.
    `self.assertEqual(output, {"label": ANY(str), "score": ANY(float)})`.

    Args:
        _type: A type, or a tuple of types, as accepted by `isinstance`.
        *more_types: Optional additional accepted types, so that
            `ANY(int, float)` is equivalent to `ANY((int, float))`.
    """

    # Defining `__eq__` already makes instances unhashable; make that explicit.
    __hash__ = None

    def __init__(self, _type, *more_types):
        self._type = (_type, *more_types) if more_types else _type

    def __eq__(self, other):
        """Return True when `other` is an instance of one of the accepted types."""
        return isinstance(other, self._type)

    def __repr__(self):
        """Return `ANY(<type names>)`; must never raise, as it is shown in failure messages."""
        accepted = self._type if isinstance(self._type, tuple) else (self._type,)
        # Fall back to `repr` for entries without `__name__` (e.g. nested tuples).
        names = ", ".join(getattr(entry, "__name__", repr(entry)) for entry in accepted)
        return f"ANY({names})"
|
|
|
|
|
|
class PipelineTestCaseMeta(type):
    """Metaclass that generates one pipeline test per model/tokenizer/feature-extractor combination.

    The class body may define `model_mapping` (tests prefixed `pt`) and/or
    `tf_model_mapping` (tests prefixed `tf`), each mapping a configuration
    class to one model class or a tuple of model classes. For every
    architecture and every matching tokenizer class, a
    `test_<prefix>_<config>_<model>_<tokenizer>_<feature_extractor>` method is
    added to the class, provided a tokenizer class or a feature extractor
    class exists for the configuration.

    Generated tests rely on the host class implementing
    `get_test_pipeline(model, tokenizer, feature_extractor)` and
    `run_pipeline_test(pipeline, examples)`.

    `test_small_model_pt` and `test_small_model_tf` are always present: when
    the class body does not define them, a placeholder raising
    `NotImplementedError` is installed.
    """

    def __new__(mcs, name, bases, dct):
        def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class, feature_extractor_class):
            @skipIf(tiny_config is None, "TinyConfig does not exist")
            @skipIf(checkpoint is None, "checkpoint does not exist")
            def test(self):
                model_class_name = ModelClass.__name__
                if model_class_name.endswith("ForCausalLM"):
                    tiny_config.is_encoder_decoder = False
                    if hasattr(tiny_config, "encoder_no_repeat_ngram_size"):
                        # specific for blenderbot which supports both decoder-only
                        # encoder/decoder but the test config only reflects
                        # encoder/decoder arch
                        tiny_config.encoder_no_repeat_ngram_size = 0
                if model_class_name.endswith("WithLMHead"):
                    tiny_config.is_decoder = True

                try:
                    model = ModelClass(tiny_config)
                except ImportError as e:
                    self.skipTest(
                        f"Cannot run with {tiny_config} as the model requires a library that isn't installed: {e}"
                    )
                if hasattr(model, "eval"):
                    model = model.eval()

                if tokenizer_class is None:
                    tokenizer = None
                else:
                    try:
                        tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
                        model_config = model.config
                        # XLNet actually defines it as -1.
                        if isinstance(model_config, (RobertaConfig, IBertConfig)):
                            tokenizer.model_max_length = model_config.max_position_embeddings - 2
                        elif (
                            hasattr(model_config, "max_position_embeddings")
                            and model_config.max_position_embeddings > 0
                        ):
                            tokenizer.model_max_length = model_config.max_position_embeddings
                    # Rust Panic exception are NOT Exception subclass
                    # Some test tokenizer contain broken vocabs or custom PreTokenizer, so we
                    # provide some default tokenizer and hope for the best.
                    except:  # noqa: E722
                        self.skipTest(f"Ignoring {ModelClass}, cannot create a simple tokenizer")

                feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config)
                pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor)
                if pipeline is None:
                    # The test can disable itself, but it should be very marginal
                    # Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist)
                    return
                self.run_pipeline_test(pipeline, examples)

                def run_batch_test(pipeline, examples):
                    tokenizer_in_use = pipeline.tokenizer
                    if tokenizer_in_use is not None and tokenizer_in_use.pad_token_id is None:
                        return  # No batching for this and it's OK

                    # 10 examples with batch size 4 means there needs to be a unfinished batch
                    # which is important for the unbatcher
                    # Need to copy because `Conversation` are stateful
                    dataset = [copy.deepcopy(random.choice(examples)) for _ in range(10)]

                    # Only exhaust the iterator; the produced items are not inspected.
                    for _ in pipeline(dataset, batch_size=4):
                        pass

                run_batch_test(pipeline, examples)

            return test

        for prefix, key in (("pt", "model_mapping"), ("tf", "tf_model_mapping")):
            # A missing or empty mapping simply generates no test for that framework.
            mapping = dct.get(key, {}) or {}
            for configuration, model_architectures in mapping.items():
                if not isinstance(model_architectures, tuple):
                    model_architectures = (model_architectures,)

                for model_architecture in model_architectures:
                    checkpoint = get_checkpoint_from_architecture(model_architecture)
                    tiny_config = get_tiny_config_from_class(configuration)
                    # We need to test even if there are no tokenizers.
                    tokenizer_classes = TOKENIZER_MAPPING.get(configuration, []) or [None]
                    feature_extractor_class = FEATURE_EXTRACTOR_MAPPING.get(configuration, None)
                    if feature_extractor_class:
                        feature_extractor_name = feature_extractor_class.__name__
                    else:
                        feature_extractor_name = "nofeature_extractor"

                    for tokenizer_class in tokenizer_classes:
                        tokenizer_name = "notokenizer" if tokenizer_class is None else tokenizer_class.__name__

                        test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_name}_{feature_extractor_name}"

                        # Nothing to feed the pipeline with: no test is generated.
                        if tokenizer_class is None and feature_extractor_class is None:
                            continue

                        dct[test_name] = gen_test(
                            model_architecture,
                            checkpoint,
                            tiny_config,
                            tokenizer_class,
                            feature_extractor_class,
                        )

        @abstractmethod
        def inner(self):
            raise NotImplementedError("Not implemented test")

        # Force these 2 methods to exist
        for forced_name in ("test_small_model_pt", "test_small_model_tf"):
            dct[forced_name] = dct.get(forced_name, inner)

        return type.__new__(mcs, name, bases, dct)
|
|
|
|
|
|
@is_pipeline_test
class CommonPipelineTest(unittest.TestCase):
    """Checks of `pipeline()` / `get_task()` behavior that are not tied to a single task."""

    @require_torch
    def test_pipeline_iteration(self):
        """Iterating a pipeline over a torch `Dataset` produces label/score dicts."""
        from torch.utils.data import Dataset

        class MyDataset(Dataset):
            data = [
                "This is a test",
                "This restaurant is great",
                "This restaurant is awful",
            ]

            def __len__(self):
                return len(self.data)

            def __getitem__(self, i):
                return self.data[i]

        text_classifier = pipeline(
            task="text-classification", model="Narsil/tiny-distilbert-sequence-classification", framework="pt"
        )
        expected_shape = {"label": ANY(str), "score": ANY(float)}
        for output in text_classifier(MyDataset()):
            self.assertEqual(output, expected_shape)

    @require_torch
    def test_check_task_auto_inference(self):
        """Without an explicit task, the pipeline class is inferred from the model."""
        inferred = pipeline(model="Narsil/tiny-distilbert-sequence-classification")

        self.assertIsInstance(inferred, TextClassificationPipeline)

    @require_torch
    def test_pipeline_override(self):
        """`pipeline_class` makes `pipeline()` instantiate the given subclass."""

        class MyPipeline(TextClassificationPipeline):
            pass

        text_classifier = pipeline(model="Narsil/tiny-distilbert-sequence-classification", pipeline_class=MyPipeline)

        self.assertIsInstance(text_classifier, MyPipeline)

    def test_check_task(self):
        """`get_task` resolves `gpt2` to text-generation and rejects an unsupported checkpoint."""
        self.assertEqual(get_task("gpt2"), "text-generation")

        with self.assertRaises(RuntimeError):
            # Wrong framework
            get_task("espnet/siddhana_slurp_entity_asr_train_asr_conformer_raw_en_word_valid.acc.ave_10best")

    @require_torch
    def test_iterator_data(self):
        """A plain Python generator can be fed to a torch pipeline, with or without `num_workers`."""

        def data(n: int):
            for _ in range(n):
                yield "This is a test"

        pipe = pipeline(model="Narsil/tiny-distilbert-sequence-classification")
        expected = {"label": "LABEL_1", "score": 0.502}

        # First pass: default arguments.
        # Second pass: when using multiple workers on streamable data it should still work.
        # This will force using `num_workers=1` with a warning for now.
        for call_kwargs in ({}, {"num_workers": 2}):
            results = []
            for out in pipe(data(10), **call_kwargs):
                self.assertEqual(nested_simplify(out), expected)
                results.append(out)
            self.assertEqual(len(results), 10)

    @require_tf
    def test_iterator_data_tf(self):
        """A plain Python generator can be fed to a TensorFlow pipeline."""

        def data(n: int):
            for _ in range(n):
                yield "This is a test"

        pipe = pipeline(model="Narsil/tiny-distilbert-sequence-classification", framework="tf")
        # Single-input call whose result is not checked.
        # NOTE(review): presumably a warm-up/sanity call - confirm it is still needed.
        pipe("This is a test")

        expected = {"label": "LABEL_1", "score": 0.502}
        results = []
        for out in pipe(data(10)):
            self.assertEqual(nested_simplify(out), expected)
            results.append(out)
        self.assertEqual(len(results), 10)
|
|
|
|
|
|
@is_pipeline_test
class PipelinePadTest(unittest.TestCase):
    """Tests of `_pad`, which collates one key across a list of per-item dicts."""

    @require_torch
    def test_pipeline_padding(self):
        """Token tensors of different lengths are padded to the longest, on the requested side."""
        import torch

        short_item = {
            "label": "label1",
            "input_ids": torch.LongTensor([[1, 23, 24, 2]]),
            "attention_mask": torch.LongTensor([[0, 1, 1, 0]]),
        }
        long_item = {
            "label": "label2",
            "input_ids": torch.LongTensor([[1, 23, 24, 43, 44, 2]]),
            "attention_mask": torch.LongTensor([[0, 1, 1, 1, 1, 0]]),
        }
        items = [short_item, long_item]

        # Non-tensor values are gathered into a list.
        self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"])

        right_padded_ids = torch.LongTensor([[1, 23, 24, 2, 10, 10], [1, 23, 24, 43, 44, 2]])
        self.assertTrue(torch.allclose(_pad(items, "input_ids", 10, "right"), right_padded_ids))

        left_padded_ids = torch.LongTensor([[10, 10, 1, 23, 24, 2], [1, 23, 24, 43, 44, 2]])
        self.assertTrue(torch.allclose(_pad(items, "input_ids", 10, "left"), left_padded_ids))

        right_padded_mask = torch.LongTensor([[0, 1, 1, 0, 0, 0], [0, 1, 1, 1, 1, 0]])
        self.assertTrue(torch.allclose(_pad(items, "attention_mask", 0, "right"), right_padded_mask))

    @require_torch
    def test_pipeline_image_padding(self):
        """Same-sized image tensors are stacked along the first dimension."""
        import torch

        items = [
            {"label": label, "pixel_values": torch.zeros((1, 3, 10, 10))} for label in ("label1", "label2")
        ]

        self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"])

        stacked_images = torch.zeros((2, 3, 10, 10))
        self.assertTrue(torch.allclose(_pad(items, "pixel_values", 10, "right"), stacked_images))

    @require_torch
    def test_pipeline_offset_mapping(self):
        """Offset mappings of lengths 11 and 4 are padded into a `(2, 11, 2)` tensor."""
        import torch

        items = [
            {"offset_mappings": torch.zeros([1, length, 2], dtype=torch.long)} for length in (11, 4)
        ]

        padded_offsets = torch.zeros((2, 11, 2), dtype=torch.long)
        self.assertTrue(torch.allclose(_pad(items, "offset_mappings", 0, "right"), padded_offsets))
|