Merge branch 'main' into add-owlv2-fast-processor

lmarshall12 2025-06-25 19:16:59 +01:00 committed by GitHub
commit be55fff230
44 changed files with 21 additions and 2504 deletions

View File

@ -473,13 +473,6 @@ For example, here is a test that must be run only when there are 2 or more GPUs
def test_example_with_multi_gpu():
```
If a test requires `tensorflow`, use the `require_tf` decorator. For example:
```python no-style
@require_tf
def test_tf_thing_with_tensorflow():
```
These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is how to set it up:
@ -1204,9 +1197,6 @@ if torch.cuda.is_available():
import numpy as np
np.random.seed(seed)
# tf RNG
tf.random.set_seed(seed)
```
### Debugging tests

View File

@ -474,13 +474,6 @@ For example, here is a test that must be run only when there are 2 or more GPUs
def test_example_with_multi_gpu():
```
If a test requires `tensorflow` use the `require_tf` decorator. For example:
```python no-style
@require_tf
def test_tf_thing_with_tensorflow():
```
These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is
how to set it up:
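The snippet the docs show at this point is cut off by the hunk boundary; a minimal sketch of the stacked form (using `require_torch_gpu` and `slow`, which `transformers.testing_utils` provides) would look like this:

```python no-style
@require_torch_gpu
@slow
def test_example_slow_on_gpu():
```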
@ -1226,11 +1219,6 @@ if torch.cuda.is_available():
import numpy as np
np.random.seed(seed)
# tf RNG
import tensorflow as tf
tf.random.set_seed(seed)
```
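For context, the seeding recipe this hunk trims down keeps the Python, PyTorch, and NumPy generators; assembled into one block (a sketch pieced together from the surrounding snippet, with `seed` defined explicitly), it reads:

```python
import random

import numpy as np
import torch

seed = 42

# python RNG
random.seed(seed)

# pytorch RNGs
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# numpy RNG
np.random.seed(seed)
```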
### Debugging tests

View File

@ -445,13 +445,6 @@ CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py
def test_example_with_multi_gpu():
```
If a test requires `tensorflow`, use the `require_tf` decorator. For example:
```python no-style
@require_tf
def test_tf_thing_with_tensorflow():
```
These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is how to set it up:
@ -1135,9 +1128,6 @@ if torch.cuda.is_available():
import numpy as np
np.random.seed(seed)
# tf RNG
tf.random.set_seed(seed)
```

View File

@ -473,13 +473,6 @@ The GPU requirements are summarized in the table below:
def test_example_with_multi_gpu():
```
If a test requires `tensorflow`, use the `require_tf` decorator. For example:
```python no-style
@require_tf
def test_tf_thing_with_tensorflow():
```
These decorators can be stacked.
For example, if a test is slow and requires at least one GPU under pytorch, you can set it up as follows:

View File

@ -705,6 +705,9 @@ def require_tf(test_case):
    """
    Decorator marking a test that requires TensorFlow. These tests are skipped when TensorFlow isn't installed.
    """
    logger.warning_once(
        "TensorFlow test-related code, including `require_tf`, is deprecated and will be removed in Transformers v4.55"
    )
    return unittest.skipUnless(is_tf_available(), "test requires TensorFlow")(test_case)
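For reference, a marked test keeps working exactly as before; the only change is the one-time warning. A minimal sketch (test name taken from the docs example above):

```python
from transformers.testing_utils import require_tf


@require_tf  # now logs the v4.55 deprecation warning once, then skips when TF is missing
def test_tf_thing_with_tensorflow():
    ...
```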

View File

@ -1,106 +0,0 @@
import unittest
from pathlib import Path
from tempfile import TemporaryDirectory

from transformers import AutoConfig, TFAutoModel, is_tensorflow_text_available, is_tf_available
from transformers.models.bert.tokenization_bert import BertTokenizer
from transformers.testing_utils import require_tensorflow_text, require_tf, slow


if is_tf_available():
    import tensorflow as tf

    from transformers.modeling_tf_utils import keras

if is_tensorflow_text_available():
    from transformers.models.bert import TFBertTokenizer


TOKENIZER_CHECKPOINTS = ["google-bert/bert-base-uncased", "google-bert/bert-base-cased"]
TINY_MODEL_CHECKPOINT = "hf-internal-testing/tiny-bert-tf-only"

if is_tf_available():
    from transformers.modeling_tf_utils import keras

    class ModelToSave(keras.Model):
        def __init__(self, tokenizer):
            super().__init__()
            self.tokenizer = tokenizer
            config = AutoConfig.from_pretrained(TINY_MODEL_CHECKPOINT)
            self.bert = TFAutoModel.from_config(config)

        def call(self, inputs):
            tokenized = self.tokenizer(inputs)
            out = self.bert(tokenized)
            return out["pooler_output"]


@require_tf
@require_tensorflow_text
class BertTokenizationTest(unittest.TestCase):
    # The TF tokenizers are usually going to be used as pretrained tokenizers from existing model checkpoints,
    # so that's what we focus on here.

    def setUp(self):
        super().setUp()
        self.tokenizers = [BertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
        self.tf_tokenizers = [TFBertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
        assert len(self.tokenizers) == len(self.tf_tokenizers)

        self.test_sentences = [
            "This is a straightforward English test sentence.",
            "This one has some weird characters\rto\nsee\r\nif those\u00e9break things.",
            "Now we're going to add some Chinese: 一 二 三 一二三",
            "And some much more rare Chinese: 齉 堃 齉堃",
            "Je vais aussi écrire en français pour tester les accents",
            "Classical Irish also has some unusual characters, so in they go: Gaelaċ, ꝼ",
        ]
        self.paired_sentences = list(zip(self.test_sentences, self.test_sentences[::-1]))

    def test_output_equivalence(self):
        for tokenizer, tf_tokenizer in zip(self.tokenizers, self.tf_tokenizers):
            for test_inputs in (self.test_sentences, self.paired_sentences):
                python_outputs = tokenizer(test_inputs, return_tensors="tf", padding="longest")
                tf_outputs = tf_tokenizer(test_inputs)

                for key in python_outputs.keys():
                    self.assertTrue(tf.reduce_all(python_outputs[key].shape == tf_outputs[key].shape))
                    self.assertTrue(tf.reduce_all(tf.cast(python_outputs[key], tf.int64) == tf_outputs[key]))

    @slow
    def test_different_pairing_styles(self):
        for tf_tokenizer in self.tf_tokenizers:
            merged_outputs = tf_tokenizer(self.paired_sentences)
            separated_outputs = tf_tokenizer(
                text=[sentence[0] for sentence in self.paired_sentences],
                text_pair=[sentence[1] for sentence in self.paired_sentences],
            )
            for key in merged_outputs.keys():
                self.assertTrue(tf.reduce_all(tf.cast(merged_outputs[key], tf.int64) == separated_outputs[key]))

    @slow
    def test_graph_mode(self):
        for tf_tokenizer in self.tf_tokenizers:
            compiled_tokenizer = tf.function(tf_tokenizer)
            for test_inputs in (self.test_sentences, self.paired_sentences):
                test_inputs = tf.constant(test_inputs)
                compiled_outputs = compiled_tokenizer(test_inputs)
                eager_outputs = tf_tokenizer(test_inputs)

                for key in eager_outputs.keys():
                    self.assertTrue(tf.reduce_all(eager_outputs[key] == compiled_outputs[key]))

    @slow
    def test_export_for_inference(self):
        for tf_tokenizer in self.tf_tokenizers:
            model = ModelToSave(tokenizer=tf_tokenizer)
            test_inputs = tf.convert_to_tensor(self.test_sentences)
            out = model(test_inputs)  # Build model with some sample inputs
            with TemporaryDirectory() as tempdir:
                save_path = Path(tempdir) / "saved.model"
                model.export(save_path)
                loaded_model = tf.saved_model.load(save_path)
                loaded_output = loaded_model.serve(test_inputs)
                # We may see small differences because the loaded model is compiled, so we need an epsilon for the test
                self.assertLessEqual(tf.reduce_max(tf.abs(out - loaded_output)), 1e-5)

View File

@ -1,131 +0,0 @@
import unittest
from pathlib import Path
from tempfile import TemporaryDirectory

from transformers import AutoConfig, TFGPT2LMHeadModel, is_keras_nlp_available, is_tf_available
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from transformers.testing_utils import require_keras_nlp, require_tf, slow


if is_tf_available():
    import tensorflow as tf

if is_keras_nlp_available():
    from transformers.models.gpt2 import TFGPT2Tokenizer


TOKENIZER_CHECKPOINTS = ["openai-community/gpt2"]
TINY_MODEL_CHECKPOINT = "openai-community/gpt2"

if is_tf_available():

    class ModelToSave(tf.Module):
        def __init__(self, tokenizer):
            super().__init__()
            self.tokenizer = tokenizer
            config = AutoConfig.from_pretrained(TINY_MODEL_CHECKPOINT)
            self.model = TFGPT2LMHeadModel.from_config(config)

        @tf.function(input_signature=(tf.TensorSpec((None,), tf.string, name="text"),))
        def serving(self, text):
            tokenized = self.tokenizer(text)
            input_ids_dense = tokenized["input_ids"].to_tensor()
            input_mask = tf.cast(input_ids_dense > 0, tf.int32)
            # input_mask = tf.reshape(input_mask, [-1, MAX_SEQ_LEN])
            outputs = self.model(input_ids=input_ids_dense, attention_mask=input_mask)["logits"]
            return outputs


@require_tf
@require_keras_nlp
class GPTTokenizationTest(unittest.TestCase):
    # The TF tokenizers are usually going to be used as pretrained tokenizers from existing model checkpoints,
    # so that's what we focus on here.

    def setUp(self):
        super().setUp()
        self.tokenizers = [GPT2Tokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
        self.tf_tokenizers = [TFGPT2Tokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
        assert len(self.tokenizers) == len(self.tf_tokenizers)

        self.test_sentences = [
            "This is a straightforward English test sentence.",
            "This one has some weird characters\rto\nsee\r\nif those\u00e9break things.",
            "Now we're going to add some Chinese: 一 二 三 一二三",
            "And some much more rare Chinese: 齉 堃 齉堃",
            "Je vais aussi écrire en français pour tester les accents",
            "Classical Irish also has some unusual characters, so in they go: Gaelaċ, ꝼ",
        ]
        self.paired_sentences = list(zip(self.test_sentences, self.test_sentences[::-1]))

    def test_output_equivalence(self):
        for tokenizer, tf_tokenizer in zip(self.tokenizers, self.tf_tokenizers):
            for test_inputs in self.test_sentences:
                python_outputs = tokenizer([test_inputs], return_tensors="tf")
                tf_outputs = tf_tokenizer([test_inputs])

                for key in python_outputs.keys():
                    # convert them to numpy to avoid messing with ragged tensors
                    python_outputs_values = python_outputs[key].numpy()
                    tf_outputs_values = tf_outputs[key].numpy()
                    self.assertTrue(tf.reduce_all(python_outputs_values.shape == tf_outputs_values.shape))
                    self.assertTrue(tf.reduce_all(tf.cast(python_outputs_values, tf.int64) == tf_outputs_values))

    @slow
    def test_graph_mode(self):
        for tf_tokenizer in self.tf_tokenizers:
            compiled_tokenizer = tf.function(tf_tokenizer)
            for test_inputs in self.test_sentences:
                test_inputs = tf.constant(test_inputs)
                compiled_outputs = compiled_tokenizer(test_inputs)
                eager_outputs = tf_tokenizer(test_inputs)

                for key in eager_outputs.keys():
                    self.assertTrue(tf.reduce_all(eager_outputs[key] == compiled_outputs[key]))

    @slow
    def test_saved_model(self):
        for tf_tokenizer in self.tf_tokenizers:
            model = ModelToSave(tokenizer=tf_tokenizer)
            test_inputs = tf.convert_to_tensor([self.test_sentences[0]])
            out = model.serving(test_inputs)  # Build model with some sample inputs
            with TemporaryDirectory() as tempdir:
                save_path = Path(tempdir) / "saved.model"
                tf.saved_model.save(model, save_path, signatures={"serving_default": model.serving})
                loaded_model = tf.saved_model.load(save_path)
                loaded_output = loaded_model.signatures["serving_default"](test_inputs)["output_0"]
                # We may see small differences because the loaded model is compiled, so we need an epsilon for the test
                self.assertTrue(tf.reduce_all(out == loaded_output))

    @slow
    def test_from_config(self):
        for tf_tokenizer in self.tf_tokenizers:
            test_inputs = tf.convert_to_tensor([self.test_sentences[0]])
            out = tf_tokenizer(test_inputs)  # Build model with some sample inputs

            config = tf_tokenizer.get_config()
            model_from_config = TFGPT2Tokenizer.from_config(config)
            from_config_output = model_from_config(test_inputs)

            for key in from_config_output.keys():
                self.assertTrue(tf.reduce_all(from_config_output[key] == out[key]))

    @slow
    def test_padding(self):
        for tf_tokenizer in self.tf_tokenizers:
            # for the test to run
            tf_tokenizer.pad_token_id = 123123

            for max_length in [3, 5, 1024]:
                test_inputs = tf.convert_to_tensor([self.test_sentences[0]])
                out = tf_tokenizer(test_inputs, max_length=max_length)

                out_length = out["input_ids"].numpy().shape[1]

                assert out_length == max_length

View File

@ -34,7 +34,6 @@ from transformers import (
from transformers.models.layoutlmv3.tokenization_layoutlmv3 import VOCAB_FILES_NAMES, LayoutLMv3Tokenizer
from transformers.testing_utils import (
    require_pandas,
    require_tf,
    require_tokenizers,
    require_torch,
    slow,
@ -2306,42 +2305,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    def test_np_encode_plus_sent_to_model(self):
        pass

    @require_tf
    @slow
    def test_tf_encode_plus_sent_to_model(self):
        from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING

        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING)

        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
                    self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING")

                config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
                config = config_class()

                if config.is_encoder_decoder or config.pad_token_id is None:
                    self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.")

                model = model_class(config)

                # Make sure the model contains at least the full vocabulary size in its embedding matrix
                self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))

                # Build sequence
                first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(first_ten_tokens))]
                encoded_sequence = tokenizer.encode_plus(first_ten_tokens, boxes=boxes, return_tensors="tf")
                batch_encoded_sequence = tokenizer.batch_encode_plus(
                    [first_ten_tokens, first_ten_tokens], boxes=[boxes, boxes], return_tensors="tf"
                )

                # This should not fail
                model(encoded_sequence)
                model(batch_encoded_sequence)

    @unittest.skip(reason="Chat is not supported")
    def test_chat_template(self):
        pass

View File

@ -24,7 +24,6 @@ from transformers.testing_utils import (
    require_essentia,
    require_librosa,
    require_scipy,
    require_tf,
    require_torch,
)
from transformers.utils.import_utils import (
@ -231,28 +230,6 @@ class Pop2PianoFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittes
        # check shape
        self.assertEqual(len(input_features["input_features"].shape), 3)

    @require_tf
    def test_batch_feature_tf(self):
        import tensorflow as tf

        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_input1 = np.zeros([1_000_000], dtype=np.float32)
        speech_input2 = np.ones([2_000_000], dtype=np.float32)
        speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)

        input_features = feature_extractor(
            [speech_input1, speech_input2, speech_input3],
            sampling_rate=[44_100, 16_000, 48_000],
            return_tensors="tf",
            return_attention_mask=True,
        )

        # check tf tensor or not
        self.assertTrue(tf.is_tensor(input_features["input_features"]))

        # check shape
        self.assertEqual(len(input_features["input_features"].shape), 3)

    @unittest.skip(
        "Pop2PianoFeatureExtractor does not supports padding externally (while processing audios in batches padding is automatically applied to max_length)"
    )

View File

@ -17,15 +17,10 @@ import unittest
import numpy as np
from transformers.testing_utils import (
    require_tf,
    require_torch,
    require_torchvision,
    require_vision,
)
from transformers.utils import is_tf_available, is_torch_available, is_vision_available
from transformers.testing_utils import require_torch, require_torchvision, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_processing_common import ProcessorTesterMixin, prepare_image_inputs
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
@ -38,11 +33,6 @@ if is_torch_available():
    from transformers.models.sam.image_processing_sam import _mask_to_rle_pytorch

if is_tf_available():
    import tensorflow as tf

    from transformers.models.sam.image_processing_sam import _mask_to_rle_tf
@require_vision
@require_torchvision
@ -202,143 +192,3 @@ class SamProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertEqual(len(rle), 1)
        self.assertEqual(rle[0]["size"], [2, 2])
        self.assertEqual(rle[0]["counts"], [1, 3])  # 1 zero, followed by 3 ones


@require_vision
@require_tf
class TFSamProcessorTest(unittest.TestCase):
    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
        image_processor = SamImageProcessor()
        processor = SamProcessor(image_processor)
        processor.save_pretrained(self.tmpdirname)

    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    # This is to avoid repeating the skipping of the common tests
    def prepare_image_inputs(self):
        """This function prepares a list of PIL images."""
        return prepare_image_inputs()

    def test_save_load_pretrained_additional_features(self):
        processor = SamProcessor(image_processor=self.get_image_processor())
        processor.save_pretrained(self.tmpdirname)

        image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)

        processor = SamProcessor.from_pretrained(self.tmpdirname, do_normalize=False, padding_value=1.0)

        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.image_processor, SamImageProcessor)

    def test_image_processor(self):
        image_processor = self.get_image_processor()
        processor = SamProcessor(image_processor=image_processor)

        image_input = self.prepare_image_inputs()

        input_feat_extract = image_processor(image_input, return_tensors="np")
        input_processor = processor(images=image_input, return_tensors="np")

        input_feat_extract.pop("original_sizes")  # pop original_sizes as it is popped in the processor
        input_feat_extract.pop("reshaped_input_sizes")  # pop reshaped_input_sizes as it is popped in the processor

        for key in input_feat_extract.keys():
            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)

    @require_tf
    def test_post_process_masks(self):
        image_processor = self.get_image_processor()
        processor = SamProcessor(image_processor=image_processor)
        dummy_masks = [tf.ones((1, 3, 5, 5))]

        original_sizes = [[1764, 2646]]

        reshaped_input_size = [[683, 1024]]
        masks = processor.post_process_masks(dummy_masks, original_sizes, reshaped_input_size, return_tensors="tf")
        self.assertEqual(masks[0].shape, (1, 3, 1764, 2646))

        masks = processor.post_process_masks(
            dummy_masks,
            tf.convert_to_tensor(original_sizes),
            tf.convert_to_tensor(reshaped_input_size),
            return_tensors="tf",
        )
        self.assertEqual(masks[0].shape, (1, 3, 1764, 2646))

        # should also work with np
        dummy_masks = [np.ones((1, 3, 5, 5))]
        masks = processor.post_process_masks(
            dummy_masks, np.array(original_sizes), np.array(reshaped_input_size), return_tensors="tf"
        )

        self.assertEqual(masks[0].shape, (1, 3, 1764, 2646))

        dummy_masks = [[1, 0], [0, 1]]
        with self.assertRaises(tf.errors.InvalidArgumentError):
            masks = processor.post_process_masks(
                dummy_masks, np.array(original_sizes), np.array(reshaped_input_size), return_tensors="tf"
            )

    def test_rle_encoding(self):
        """
        Test the run-length encoding function.
        """
        # Test that a mask of all zeros returns a single run [height * width].
        input_mask = tf.zeros((1, 2, 2), dtype=tf.int64)  # shape: 1 x 2 x 2
        rle = _mask_to_rle_tf(input_mask)

        self.assertEqual(len(rle), 1)
        self.assertEqual(rle[0]["size"], [2, 2])
        # For a 2x2 all-zero mask, we expect a single run of length 4:
        self.assertEqual(rle[0]["counts"], [4])

        # Test that a mask of all ones returns [0, height * width].
        input_mask = tf.ones((1, 2, 2), dtype=tf.int64)  # shape: 1 x 2 x 2
        rle = _mask_to_rle_tf(input_mask)

        self.assertEqual(len(rle), 1)
        self.assertEqual(rle[0]["size"], [2, 2])
        # For a 2x2 all-one mask, we expect two runs: [0, 4].
        self.assertEqual(rle[0]["counts"], [0, 4])

        # Test a mask with mixed 0s and 1s to ensure the run-length encoding is correct.
        # Example mask:
        #   Row 0: [0, 1]
        #   Row 1: [1, 1]
        # This is shape (1, 2, 2).
        # Flattened in Fortran order -> [0, 1, 1, 1].
        # The RLE for [0, 1, 1, 1] is [1, 3].
        input_mask = tf.constant([[[0, 1], [1, 1]]], dtype=tf.int64)
        rle = _mask_to_rle_tf(input_mask)

        self.assertEqual(len(rle), 1)
        self.assertEqual(rle[0]["size"], [2, 2])
        self.assertEqual(rle[0]["counts"], [1, 3])  # 1 zero, followed by 3 ones


@require_vision
@require_torchvision
class SamProcessorEquivalenceTest(unittest.TestCase):
    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
        image_processor = SamImageProcessor()
        processor = SamProcessor(image_processor)
        processor.save_pretrained(self.tmpdirname)

    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    # This is to avoid repeating the skipping of the common tests
    def prepare_image_inputs(self):
        """This function prepares a list of PIL images."""
        return prepare_image_inputs()
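For reference, the column-major run-length convention that `test_rle_encoding` checks above can be sketched framework-agnostically in NumPy. `mask_to_rle` is a hypothetical stand-in operating on a single (H, W) mask; the assumed convention is that counts always begin with a run of zeros:

```python
import numpy as np


def mask_to_rle(mask: np.ndarray) -> dict:
    # Flatten the binary mask in Fortran (column-major) order, then emit run
    # lengths; by convention the first count is a run of zeros.
    flat = mask.flatten(order="F")
    change = np.flatnonzero(flat[1:] != flat[:-1]) + 1  # positions where the value flips
    boundaries = np.concatenate(([0], change, [flat.size]))
    counts = np.diff(boundaries).tolist()
    if flat[0] == 1:
        counts = [0] + counts  # zero-length leading run when the mask starts with ones
    return {"size": list(mask.shape), "counts": counts}


# the three cases exercised by the test above
assert mask_to_rle(np.zeros((2, 2), dtype=np.int64))["counts"] == [4]
assert mask_to_rle(np.ones((2, 2), dtype=np.int64))["counts"] == [0, 4]
assert mask_to_rle(np.array([[0, 1], [1, 1]]))["counts"] == [1, 3]
```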

View File

@ -18,7 +18,7 @@ import numpy as np
from transformers.models.whisper import WhisperTokenizer, WhisperTokenizerFast
from transformers.models.whisper.tokenization_whisper import _combine_tokens_into_words, _find_longest_common_sequence
from transformers.testing_utils import require_flax, require_tf, require_torch, slow
from transformers.testing_utils import require_flax, require_torch, slow
from ...test_tokenization_common import TokenizerTesterMixin
@ -588,15 +588,6 @@ class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
        self.assertListEqual(WhisperTokenizer._convert_to_list(np_array), test_list)
        self.assertListEqual(WhisperTokenizerFast._convert_to_list(np_array), test_list)

    @require_tf
    def test_convert_to_list_tf(self):
        import tensorflow as tf

        test_list = [[1, 2, 3], [4, 5, 6]]
        tf_tensor = tf.constant(test_list)
        self.assertListEqual(WhisperTokenizer._convert_to_list(tf_tensor), test_list)
        self.assertListEqual(WhisperTokenizerFast._convert_to_list(tf_tensor), test_list)

    @require_flax
    def test_convert_to_list_jax(self):
        import jax.numpy as jnp

View File

@ -1,100 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

from transformers import is_tf_available
from transformers.testing_utils import require_tf


if is_tf_available():
    import tensorflow as tf
    from tensorflow.python.eager import context
    from tensorflow.python.framework import ops

    from transformers import GradientAccumulator, create_optimizer


@require_tf
class OptimizationFTest(unittest.TestCase):
    def assertListAlmostEqual(self, list1, list2, tol):
        self.assertEqual(len(list1), len(list2))
        for a, b in zip(list1, list2):
            self.assertAlmostEqual(a, b, delta=tol)

    def testGradientAccumulator(self):
        accumulator = GradientAccumulator()
        accumulator([tf.constant([1.0, 2.0])])
        accumulator([tf.constant([-2.0, 1.0])])
        accumulator([tf.constant([-1.0, 2.0])])
        with self.assertRaises(ValueError):
            accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
        self.assertEqual(accumulator.step, 3)
        self.assertEqual(len(accumulator.gradients), 1)
        self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [-2.0, 5.0], tol=1e-2)
        accumulator.reset()
        self.assertEqual(accumulator.step, 0)
        self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [0.0, 0.0], tol=1e-2)

    def testGradientAccumulatorDistributionStrategy(self):
        context._context = None
        ops.enable_eager_execution_internal()
        physical_devices = tf.config.list_physical_devices("CPU")
        if len(physical_devices) == 1:
            tf.config.set_logical_device_configuration(
                physical_devices[0], [tf.config.LogicalDeviceConfiguration(), tf.config.LogicalDeviceConfiguration()]
            )
        devices = tf.config.list_logical_devices(device_type="CPU")
        strategy = tf.distribute.MirroredStrategy(devices=devices[:2])

        with strategy.scope():
            accumulator = GradientAccumulator()
            variable = tf.Variable([4.0, 3.0])
            optimizer, _ = create_optimizer(5e-5, 10, 5)
            gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)

        def accumulate_on_replica(gradient):
            accumulator([gradient])

        def apply_on_replica():
            optimizer.apply_gradients(list(zip(accumulator.gradients, [variable])))

        @tf.function
        def accumulate(grad1, grad2):
            with strategy.scope():
                local_variables = strategy.experimental_local_results(gradient_placeholder)
                local_variables[0].assign(grad1)
                local_variables[1].assign(grad2)
                strategy.run(accumulate_on_replica, args=(gradient_placeholder,))

        @tf.function
        def apply_grad():
            with strategy.scope():
                strategy.run(apply_on_replica)

        def _check_local_values(grad1, grad2):
            values = strategy.experimental_local_results(accumulator._gradients[0])
            self.assertListAlmostEqual(values[0].value(), grad1, tol=1e-2)
            self.assertListAlmostEqual(values[1].value(), grad2, tol=1e-2)

        accumulate([1.0, 2.0], [-1.0, 1.0])
        accumulate([3.0, -1.0], [-1.0, -1.0])
        accumulate([-2.0, 2.0], [3.0, -2.0])
        self.assertEqual(accumulator.step, 3)
        _check_local_values([2.0, 3.0], [1.0, -2.0])
        apply_grad()
        self.assertListAlmostEqual(variable.value(), [4.0, 3.0], tol=1e-2)
        accumulator.reset()
        self.assertEqual(accumulator.step, 0)
        _check_local_values([0.0, 0.0], [0.0, 0.0])
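The semantics these tests pin down — per-variable gradient sums plus a step counter, cleared by `reset()` — can be sketched without TensorFlow. This NumPy stand-in (`GradientAccumulatorSketch` is a hypothetical name; the behaviour is inferred from the assertions above) reproduces the single-replica test:

```python
import numpy as np


class GradientAccumulatorSketch:
    """Assumed semantics of the removed TF utility: sum gradients per variable
    across calls, count steps, and zero everything on reset()."""

    def __init__(self):
        self._grads = None
        self.step = 0

    def __call__(self, grads):
        if self._grads is None:
            self._grads = [np.array(g, dtype=np.float64) for g in grads]
        elif len(grads) != len(self._grads):
            raise ValueError(f"Expected {len(self._grads)} gradients, but got {len(grads)}")
        else:
            for acc, g in zip(self._grads, grads):
                acc += np.asarray(g)
        self.step += 1

    @property
    def gradients(self):
        return self._grads

    def reset(self):
        self.step = 0
        for acc in self._grads or []:
            acc[:] = 0.0


# mirrors testGradientAccumulator above
acc = GradientAccumulatorSketch()
acc([[1.0, 2.0]])
acc([[-2.0, 1.0]])
acc([[-1.0, 2.0]])
assert acc.step == 3
assert acc.gradients[0].tolist() == [-2.0, 5.0]
acc.reset()
assert acc.step == 0 and acc.gradients[0].tolist() == [0.0, 0.0]
```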

View File

@ -28,7 +28,6 @@ from transformers.testing_utils import (
    compare_pipeline_output_to_hub_spec,
    is_pipeline_test,
    nested_simplify,
    require_tf,
    require_torch,
    require_torchaudio,
    slow,
@ -193,11 +192,6 @@ class AudioClassificationPipelineTests(unittest.TestCase):
            ],
        )

    @require_tf
    @unittest.skip(reason="Audio classification is not implemented for TF")
    def test_small_model_tf(self):
        pass

    @require_torch
    @slow
    def test_top_k_none_returns_all_labels(self):

View File

@ -40,7 +40,6 @@ from transformers.testing_utils import (
    is_torch_available,
    nested_simplify,
    require_pyctcdecode,
    require_tf,
    require_torch,
    require_torch_accelerator,
    require_torchaudio,
@ -326,10 +325,6 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
        ):
            _ = speech_recognizer(filename, return_timestamps="char")

    @require_tf
    def test_small_model_tf(self):
        self.skipTest(reason="Tensorflow not supported yet.")

    @require_torch
    @unittest.skip("TODO (joao, eustache): this test is failing, find the breaking PR and fix the cause or the test")
    def test_torch_small_no_tokenizer_files(self):

View File

@ -48,8 +48,6 @@ from transformers.testing_utils import (
    is_pipeline_test,
    is_staging_test,
    nested_simplify,
    require_tensorflow_probability,
    require_tf,
    require_torch,
    require_torch_accelerator,
    require_torch_multi_accelerator,
@ -177,20 +175,6 @@ class CommonPipelineTest(unittest.TestCase):
            results.append(out)
        self.assertEqual(len(results), 10)

    @require_tf
    def test_iterator_data_tf(self):
        def data(n: int):
            for _ in range(n):
                yield "This is a test"

        pipe = pipeline(model="hf-internal-testing/tiny-random-distilbert", framework="tf")

        out = pipe("This is a test")

        results = []
        for out in pipe(data(10)):
            self.assertEqual(nested_simplify(out), {"label": "LABEL_0", "score": 0.504})
            results.append(out)
        self.assertEqual(len(results), 10)

    @require_torch
    def test_unbatch_attentions_hidden_states(self):
        model = DistilBertForSequenceClassification.from_pretrained(
@ -262,9 +246,9 @@ class CommonPipelineTest(unittest.TestCase):
@is_pipeline_test
@require_torch
class PipelineScikitCompatTest(unittest.TestCase):
    @require_torch
    def test_pipeline_predict_pt(self):
    def test_pipeline_predict(self):
        data = ["This is a test"]

        text_classifier = pipeline(
@ -275,20 +259,7 @@ class PipelineScikitCompatTest(unittest.TestCase):
        actual_output = text_classifier.predict(data)
        self.assertEqual(expected_output, actual_output)

    @require_tf
    def test_pipeline_predict_tf(self):
        data = ["This is a test"]

        text_classifier = pipeline(
            task="text-classification", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
        )
        expected_output = [{"label": ANY(str), "score": ANY(float)}]
        actual_output = text_classifier.predict(data)
        self.assertEqual(expected_output, actual_output)

    @require_torch
    def test_pipeline_transform_pt(self):
    def test_pipeline_transform(self):
        data = ["This is a test"]

        text_classifier = pipeline(
@ -299,18 +270,6 @@ class PipelineScikitCompatTest(unittest.TestCase):
        actual_output = text_classifier.transform(data)
        self.assertEqual(expected_output, actual_output)

    @require_tf
    def test_pipeline_transform_tf(self):
        data = ["This is a test"]

        text_classifier = pipeline(
            task="text-classification", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
        )
        expected_output = [{"label": ANY(str), "score": ANY(float)}]
        actual_output = text_classifier.transform(data)
        self.assertEqual(expected_output, actual_output)
@is_pipeline_test
class PipelinePadTest(unittest.TestCase):
@ -620,23 +579,6 @@ class PipelineUtilsTest(unittest.TestCase):
        gc.collect()
        backend_empty_cache(torch_device)

    @slow
    @require_tf
    def test_load_default_pipelines_tf(self):
        from transformers.modeling_tf_utils import keras
        from transformers.pipelines import SUPPORTED_TASKS

        set_seed_fn = lambda: keras.utils.set_random_seed(0)  # noqa: E731
        for task in SUPPORTED_TASKS.keys():
            if task == "table-question-answering":
                # test table in separate test due to more dependencies
                continue

            self.check_default_pipeline(task, "tf", set_seed_fn, self.check_models_equal_tf)

            # clean-up as much as possible GPU memory occupied by TF
            gc.collect()

    @slow
    @require_torch
    def test_load_default_pipelines_pt_table_qa(self):
@ -663,18 +605,6 @@ class PipelineUtilsTest(unittest.TestCase):
        pipe = pipeline("text-generation", device=torch_device)
        _ = pipe("Hello")

    @slow
    @require_tf
    @require_tensorflow_probability
    def test_load_default_pipelines_tf_table_qa(self):
        import tensorflow as tf

        set_seed_fn = lambda: tf.random.set_seed(0)  # noqa: E731
        self.check_default_pipeline("table-question-answering", "tf", set_seed_fn, self.check_models_equal_tf)

        # clean-up as much as possible GPU memory occupied by PyTorch
        gc.collect()

    def check_default_pipeline(self, task, framework, set_seed_fn, check_models_equal_fn):
        from transformers.pipelines import SUPPORTED_TASKS, pipeline
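The PyTorch twin of the removed `test_iterator_data_tf` above survives in this file; the pattern it covers — feeding a generator to a pipeline and consuming outputs lazily — sketches as follows (tiny checkpoint name taken from the test):

```python
from transformers import pipeline


def data(n: int):
    for _ in range(n):
        yield "This is a test"


pipe = pipeline(model="hf-internal-testing/tiny-random-distilbert", framework="pt")
# outputs stream back one at a time, so the full input never has to fit in memory
for out in pipe(data(10)):
    assert set(out) == {"label", "score"}
```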

View File

@ -24,7 +24,6 @@ from transformers.testing_utils import (
    compare_pipeline_output_to_hub_spec,
    is_pipeline_test,
    nested_simplify,
    require_tf,
    require_timm,
    require_torch,
    require_vision,
@ -123,11 +122,6 @@ class DepthEstimationPipelineTests(unittest.TestCase):
        for single_output in outputs:
            compare_pipeline_output_to_hub_spec(single_output, DepthEstimationOutput)

    @require_tf
    @unittest.skip(reason="Depth estimation is not implemented in TF")
    def test_small_model_tf(self):
        pass

    @slow
    @require_torch
    def test_large_model_pt(self):

View File

@ -27,7 +27,6 @@ from transformers.testing_utils import (
    nested_simplify,
    require_detectron2,
    require_pytesseract,
    require_tf,
    require_torch,
    require_torch_bf16,
    require_vision,
@ -423,8 +422,3 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase):
        question = "What is the invoice number?"
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(nested_simplify(outputs, decimals=4), [{"answer": "us-001"}])

    @require_tf
    @unittest.skip(reason="Document question answering not implemented in TF")
    def test_small_model_tf(self):
        pass

View File

@ -23,19 +23,15 @@ from transformers import (
    TF_MODEL_MAPPING,
    FeatureExtractionPipeline,
    LxmertConfig,
    is_tf_available,
    is_torch_available,
    pipeline,
)
from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch
from transformers.testing_utils import is_pipeline_test, nested_simplify, require_torch

if is_torch_available():
    import torch

if is_tf_available():
    import tensorflow as tf
@is_pipeline_test
class FeatureExtractionPipelineTests(unittest.TestCase):
@ -52,16 +48,6 @@ class FeatureExtractionPipelineTests(unittest.TestCase):
            nested_simplify(outputs),
[[[2.287, 1.234, 0.042, 1.53, 1.306, 0.879, -0.526, -1.71, -1.276, 0.756, -0.775, -1.048, -0.25, -0.595, -0.137, -0.598, 2.022, -0.812, 0.284, -0.488, -0.391, -0.403, -0.525, -0.061, -0.228, 1.086, 0.378, -0.14, 0.599, -0.087, -2.259, -0.098], [1.676, 0.232, -1.508, -0.145, 1.798, -1.388, 1.331, -0.37, -0.939, 0.043, 0.06, -0.414, -1.408, 0.24, 0.622, -0.55, -0.569, 1.873, -0.706, 1.924, -0.254, 1.927, -0.423, 0.152, -0.952, 0.509, -0.496, -0.968, 0.093, -1.049, -0.65, 0.312], [0.207, -0.775, -1.822, 0.321, -0.71, -0.201, 0.3, 1.146, -0.233, -0.753, -0.305, 1.309, -1.47, -0.21, 1.802, -1.555, -1.175, 1.323, -0.303, 0.722, -0.076, 0.103, -1.406, 1.931, 0.091, 0.237, 1.172, 1.607, 0.253, -0.9, -1.068, 0.438], [0.615, 1.077, 0.171, -0.175, 1.3, 0.901, -0.653, -0.138, 0.341, -0.654, -0.184, -0.441, -0.424, 0.356, -0.075, 0.26, -1.023, 0.814, 0.524, -0.904, -0.204, -0.623, 1.234, -1.03, 2.594, 0.56, 1.831, -0.199, -1.508, -0.492, -1.687, -2.165], [0.129, 0.008, -1.279, -0.412, -0.004, 1.663, 0.196, 0.104, 0.123, 0.119, 0.635, 1.757, 2.334, -0.799, -1.626, -1.26, 0.595, -0.316, -1.399, 0.232, 0.264, 1.386, -1.171, -0.256, -0.256, -1.944, 1.168, -0.368, -0.714, -0.51, 0.454, 1.148], [-0.32, 0.29, -1.309, -0.177, 0.453, 0.636, -0.024, 0.509, 0.931, -1.754, -1.575, 0.786, 0.046, -1.165, -1.416, 1.373, 1.293, -0.285, -1.541, -1.186, -0.106, -0.994, 2.001, 0.972, -0.02, 1.654, -0.236, 0.643, 1.02, 0.572, -0.914, -0.154], [0.7, -0.937, 0.441, 0.25, 0.78, -0.022, 0.282, -0.095, 1.558, -0.336, 1.706, 0.884, 1.28, 0.198, -0.796, 1.218, -1.769, 1.197, -0.342, -0.177, -0.645, 1.364, 0.008, -0.597, -0.484, -2.772, -0.696, -0.632, -0.34, -1.527, -0.562, 0.862], [2.504, 0.831, -1.271, -0.033, 0.298, -0.735, 1.339, 1.74, 0.233, -1.424, -0.819, -0.761, 0.291, 0.853, -0.092, -0.885, 0.164, 1.025, 0.907, 0.749, -1.515, -0.545, -1.365, 0.271, 0.034, -2.005, 0.031, 0.244, 0.621, 0.176, 0.336, -1.196], [-0.711, 0.591, -1.001, -0.946, 0.784, -1.66, 1.545, 0.799, -0.857, 1.148, 0.213, -0.285, 0.464, -0.139, 0.79, -1.663, -1.121, 0.575, -0.178, -0.508, 1.565, -0.242, -0.346, 1.024, -1.135, -0.158, -2.101, 0.275, 2.009, -0.425, 0.716, 0.981], [0.912, -1.186, -0.846, -0.421, -1.315, -0.827, 0.309, 0.533, 1.029, -2.343, 1.513, -1.238, 1.487, -0.849, 0.896, -0.927, -0.459, 0.159, 0.177, 0.873, 0.935, 1.433, -0.485, 0.737, 1.327, -0.338, 1.608, -0.47, -0.445, -1.118, -0.213, -0.446], [-0.434, -1.362, -1.098, -1.068, 1.507, 0.003, 0.413, -0.395, 0.897, -0.237, 1.405, -0.344, 1.693, 0.677, 0.097, -0.257, -0.602, 1.026, -1.229, 0.855, -0.713, 1.014, 0.443, 0.238, 0.425, -2.184, 1.933, -1.157, -1.132, -0.597, -0.785, 0.967], [0.58, -0.971, 0.789, -0.468, -0.576, 1.779, 1.747, 1.715, -1.939, 0.125, 0.656, -0.042, -1.024, -1.767, 0.107, -0.408, -0.866, -1.774, 1.248, 0.939, -0.033, 1.523, 1.168, -0.744, 0.209, -0.168, -0.316, 0.207, -0.432, 0.047, -0.646, -0.664], [-0.185, -0.613, -1.695, 1.602, -0.32, -0.277, 0.967, 0.728, -0.965, -0.234, 1.069, -0.63, -1.631, 0.711, 0.426, 1.298, -0.191, -0.467, -0.771, 0.971, -0.118, -1.577, -2.064, -0.055, -0.59, 0.642, -0.997, 1.251, 0.538, 1.367, 0.106, 1.704]]]) # fmt: skip
    @require_tf
    def test_small_model_tf(self):
        feature_extractor = pipeline(
            task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
        )
        outputs = feature_extractor("This is a test")
        self.assertEqual(
            nested_simplify(outputs),
[[[2.287, 1.234, 0.042, 1.53, 1.306, 0.879, -0.526, -1.71, -1.276, 0.756, -0.775, -1.048, -0.25, -0.595, -0.137, -0.598, 2.022, -0.812, 0.284, -0.488, -0.391, -0.403, -0.525, -0.061, -0.228, 1.086, 0.378, -0.14, 0.599, -0.087, -2.259, -0.098], [1.676, 0.232, -1.508, -0.145, 1.798, -1.388, 1.331, -0.37, -0.939, 0.043, 0.06, -0.414, -1.408, 0.24, 0.622, -0.55, -0.569, 1.873, -0.706, 1.924, -0.254, 1.927, -0.423, 0.152, -0.952, 0.509, -0.496, -0.968, 0.093, -1.049, -0.65, 0.312], [0.207, -0.775, -1.822, 0.321, -0.71, -0.201, 0.3, 1.146, -0.233, -0.753, -0.305, 1.309, -1.47, -0.21, 1.802, -1.555, -1.175, 1.323, -0.303, 0.722, -0.076, 0.103, -1.406, 1.931, 0.091, 0.237, 1.172, 1.607, 0.253, -0.9, -1.068, 0.438], [0.615, 1.077, 0.171, -0.175, 1.3, 0.901, -0.653, -0.138, 0.341, -0.654, -0.184, -0.441, -0.424, 0.356, -0.075, 0.26, -1.023, 0.814, 0.524, -0.904, -0.204, -0.623, 1.234, -1.03, 2.594, 0.56, 1.831, -0.199, -1.508, -0.492, -1.687, -2.165], [0.129, 0.008, -1.279, -0.412, -0.004, 1.663, 0.196, 0.104, 0.123, 0.119, 0.635, 1.757, 2.334, -0.799, -1.626, -1.26, 0.595, -0.316, -1.399, 0.232, 0.264, 1.386, -1.171, -0.256, -0.256, -1.944, 1.168, -0.368, -0.714, -0.51, 0.454, 1.148], [-0.32, 0.29, -1.309, -0.177, 0.453, 0.636, -0.024, 0.509, 0.931, -1.754, -1.575, 0.786, 0.046, -1.165, -1.416, 1.373, 1.293, -0.285, -1.541, -1.186, -0.106, -0.994, 2.001, 0.972, -0.02, 1.654, -0.236, 0.643, 1.02, 0.572, -0.914, -0.154], [0.7, -0.937, 0.441, 0.25, 0.78, -0.022, 0.282, -0.095, 1.558, -0.336, 1.706, 0.884, 1.28, 0.198, -0.796, 1.218, -1.769, 1.197, -0.342, -0.177, -0.645, 1.364, 0.008, -0.597, -0.484, -2.772, -0.696, -0.632, -0.34, -1.527, -0.562, 0.862], [2.504, 0.831, -1.271, -0.033, 0.298, -0.735, 1.339, 1.74, 0.233, -1.424, -0.819, -0.761, 0.291, 0.853, -0.092, -0.885, 0.164, 1.025, 0.907, 0.749, -1.515, -0.545, -1.365, 0.271, 0.034, -2.005, 0.031, 0.244, 0.621, 0.176, 0.336, -1.196], [-0.711, 0.591, -1.001, -0.946, 0.784, -1.66, 1.545, 0.799, -0.857, 1.148, 0.213, -0.285, 0.464, -0.139, 0.79, -1.663, -1.121, 0.575, -0.178, -0.508, 1.565, -0.242, -0.346, 1.024, -1.135, -0.158, -2.101, 0.275, 2.009, -0.425, 0.716, 0.981], [0.912, -1.186, -0.846, -0.421, -1.315, -0.827, 0.309, 0.533, 1.029, -2.343, 1.513, -1.238, 1.487, -0.849, 0.896, -0.927, -0.459, 0.159, 0.177, 0.873, 0.935, 1.433, -0.485, 0.737, 1.327, -0.338, 1.608, -0.47, -0.445, -1.118, -0.213, -0.446], [-0.434, -1.362, -1.098, -1.068, 1.507, 0.003, 0.413, -0.395, 0.897, -0.237, 1.405, -0.344, 1.693, 0.677, 0.097, -0.257, -0.602, 1.026, -1.229, 0.855, -0.713, 1.014, 0.443, 0.238, 0.425, -2.184, 1.933, -1.157, -1.132, -0.597, -0.785, 0.967], [0.58, -0.971, 0.789, -0.468, -0.576, 1.779, 1.747, 1.715, -1.939, 0.125, 0.656, -0.042, -1.024, -1.767, 0.107, -0.408, -0.866, -1.774, 1.248, 0.939, -0.033, 1.523, 1.168, -0.744, 0.209, -0.168, -0.316, 0.207, -0.432, 0.047, -0.646, -0.664], [-0.185, -0.613, -1.695, 1.602, -0.32, -0.277, 0.967, 0.728, -0.965, -0.234, 1.069, -0.63, -1.631, 0.711, 0.426, 1.298, -0.191, -0.467, -0.771, 0.971, -0.118, -1.577, -2.064, -0.055, -0.59, 0.642, -0.997, 1.251, 0.538, 1.367, 0.106, 1.704]]]) # fmt: skip
    @require_torch
    def test_tokenization_small_model_pt(self):
        feature_extractor = pipeline(
@ -102,46 +88,6 @@ class FeatureExtractionPipelineTests(unittest.TestCase):
            tokenize_kwargs=tokenize_kwargs,
        )
    @require_tf
    def test_tokenization_small_model_tf(self):
        feature_extractor = pipeline(
            task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
        )

        # test with empty parameters
        outputs = feature_extractor("This is a test")
        self.assertEqual(
            nested_simplify(outputs),
[[[2.287, 1.234, 0.042, 1.53, 1.306, 0.879, -0.526, -1.71, -1.276, 0.756, -0.775, -1.048, -0.25, -0.595, -0.137, -0.598, 2.022, -0.812, 0.284, -0.488, -0.391, -0.403, -0.525, -0.061, -0.228, 1.086, 0.378, -0.14, 0.599, -0.087, -2.259, -0.098], [1.676, 0.232, -1.508, -0.145, 1.798, -1.388, 1.331, -0.37, -0.939, 0.043, 0.06, -0.414, -1.408, 0.24, 0.622, -0.55, -0.569, 1.873, -0.706, 1.924, -0.254, 1.927, -0.423, 0.152, -0.952, 0.509, -0.496, -0.968, 0.093, -1.049, -0.65, 0.312], [0.207, -0.775, -1.822, 0.321, -0.71, -0.201, 0.3, 1.146, -0.233, -0.753, -0.305, 1.309, -1.47, -0.21, 1.802, -1.555, -1.175, 1.323, -0.303, 0.722, -0.076, 0.103, -1.406, 1.931, 0.091, 0.237, 1.172, 1.607, 0.253, -0.9, -1.068, 0.438], [0.615, 1.077, 0.171, -0.175, 1.3, 0.901, -0.653, -0.138, 0.341, -0.654, -0.184, -0.441, -0.424, 0.356, -0.075, 0.26, -1.023, 0.814, 0.524, -0.904, -0.204, -0.623, 1.234, -1.03, 2.594, 0.56, 1.831, -0.199, -1.508, -0.492, -1.687, -2.165], [0.129, 0.008, -1.279, -0.412, -0.004, 1.663, 0.196, 0.104, 0.123, 0.119, 0.635, 1.757, 2.334, -0.799, -1.626, -1.26, 0.595, -0.316, -1.399, 0.232, 0.264, 1.386, -1.171, -0.256, -0.256, -1.944, 1.168, -0.368, -0.714, -0.51, 0.454, 1.148], [-0.32, 0.29, -1.309, -0.177, 0.453, 0.636, -0.024, 0.509, 0.931, -1.754, -1.575, 0.786, 0.046, -1.165, -1.416, 1.373, 1.293, -0.285, -1.541, -1.186, -0.106, -0.994, 2.001, 0.972, -0.02, 1.654, -0.236, 0.643, 1.02, 0.572, -0.914, -0.154], [0.7, -0.937, 0.441, 0.25, 0.78, -0.022, 0.282, -0.095, 1.558, -0.336, 1.706, 0.884, 1.28, 0.198, -0.796, 1.218, -1.769, 1.197, -0.342, -0.177, -0.645, 1.364, 0.008, -0.597, -0.484, -2.772, -0.696, -0.632, -0.34, -1.527, -0.562, 0.862], [2.504, 0.831, -1.271, -0.033, 0.298, -0.735, 1.339, 1.74, 0.233, -1.424, -0.819, -0.761, 0.291, 0.853, -0.092, -0.885, 0.164, 1.025, 0.907, 0.749, -1.515, -0.545, -1.365, 0.271, 0.034, -2.005, 0.031, 0.244, 0.621, 0.176, 0.336, -1.196], [-0.711, 0.591, -1.001, -0.946, 0.784, -1.66, 1.545, 0.799, -0.857, 1.148, 0.213, -0.285, 0.464, -0.139, 0.79, -1.663, -1.121, 0.575, -0.178, -0.508, 1.565, -0.242, -0.346, 1.024, -1.135, -0.158, -2.101, 0.275, 2.009, -0.425, 0.716, 0.981], [0.912, -1.186, -0.846, -0.421, -1.315, -0.827, 0.309, 0.533, 1.029, -2.343, 1.513, -1.238, 1.487, -0.849, 0.896, -0.927, -0.459, 0.159, 0.177, 0.873, 0.935, 1.433, -0.485, 0.737, 1.327, -0.338, 1.608, -0.47, -0.445, -1.118, -0.213, -0.446], [-0.434, -1.362, -1.098, -1.068, 1.507, 0.003, 0.413, -0.395, 0.897, -0.237, 1.405, -0.344, 1.693, 0.677, 0.097, -0.257, -0.602, 1.026, -1.229, 0.855, -0.713, 1.014, 0.443, 0.238, 0.425, -2.184, 1.933, -1.157, -1.132, -0.597, -0.785, 0.967], [0.58, -0.971, 0.789, -0.468, -0.576, 1.779, 1.747, 1.715, -1.939, 0.125, 0.656, -0.042, -1.024, -1.767, 0.107, -0.408, -0.866, -1.774, 1.248, 0.939, -0.033, 1.523, 1.168, -0.744, 0.209, -0.168, -0.316, 0.207, -0.432, 0.047, -0.646, -0.664], [-0.185, -0.613, -1.695, 1.602, -0.32, -0.277, 0.967, 0.728, -0.965, -0.234, 1.069, -0.63, -1.631, 0.711, 0.426, 1.298, -0.191, -0.467, -0.771, 0.971, -0.118, -1.577, -2.064, -0.055, -0.59, 0.642, -0.997, 1.251, 0.538, 1.367, 0.106, 1.704]]]) # fmt: skip
        # test with various tokenizer parameters
        tokenize_kwargs = {"max_length": 3}
        outputs = feature_extractor("This is a test", tokenize_kwargs=tokenize_kwargs)
        self.assertEqual(np.squeeze(outputs).shape, (3, 32))

        tokenize_kwargs = {"truncation": True, "padding": True, "max_length": 4}
        outputs = feature_extractor(
            ["This is a test", "This", "This is", "This is a", "This is a test test test test"],
            tokenize_kwargs=tokenize_kwargs,
        )
        self.assertEqual(np.squeeze(outputs).shape, (5, 4, 32))

        tokenize_kwargs = {"padding": True, "max_length": 4}
        outputs = feature_extractor(
            ["This is a test", "This", "This is", "This is a", "This is a test test test test"],
            truncation=True,
            tokenize_kwargs=tokenize_kwargs,
        )
        self.assertEqual(np.squeeze(outputs).shape, (5, 4, 32))

        # raise value error if truncation parameter given for two places
        tokenize_kwargs = {"truncation": True}
        with self.assertRaises(ValueError):
            _ = feature_extractor(
                ["This is a test", "This", "This is", "This is a", "This is a test test test test"],
                truncation=True,
                tokenize_kwargs=tokenize_kwargs,
            )

    @require_torch
    def test_return_tensors_pt(self):
        feature_extractor = pipeline(
@ -150,14 +96,6 @@ class FeatureExtractionPipelineTests(unittest.TestCase):
        outputs = feature_extractor("This is a test", return_tensors=True)
        self.assertTrue(torch.is_tensor(outputs))

    @require_tf
    def test_return_tensors_tf(self):
        feature_extractor = pipeline(
            task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
        )
        outputs = feature_extractor("This is a test", return_tensors=True)
        self.assertTrue(tf.is_tensor(outputs))

    def get_shape(self, input_, shape=None):
        if shape is None:
            shape = []

View File

@ -22,7 +22,6 @@ from transformers.testing_utils import (
    is_pipeline_test,
    is_torch_available,
    nested_simplify,
    require_tf,
    require_torch,
    require_torch_accelerator,
    slow,
@ -44,47 +43,6 @@ class FillMaskPipelineTests(unittest.TestCase):
        if is_torch_available():
            backend_empty_cache(torch_device)

    @require_tf
    def test_small_model_tf(self):
        unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", top_k=2, framework="tf")
        outputs = unmasker("My name is <mask>")

        self.assertEqual(
            nested_simplify(outputs, decimals=6),
            [
                {"sequence": "My name is grouped", "score": 2.1e-05, "token": 38015, "token_str": " grouped"},
                {"sequence": "My name is accuser", "score": 2.1e-05, "token": 25506, "token_str": " accuser"},
            ],
        )

        outputs = unmasker("The largest city in France is <mask>")
        self.assertEqual(
            nested_simplify(outputs, decimals=6),
            [
                {
                    "sequence": "The largest city in France is grouped",
                    "score": 2.1e-05,
                    "token": 38015,
                    "token_str": " grouped",
                },
                {
                    "sequence": "The largest city in France is accuser",
                    "score": 2.1e-05,
                    "token": 25506,
                    "token_str": " accuser",
                },
            ],
        )

        outputs = unmasker("My name is <mask>", targets=[" Patrick", " Clara", " Teven"], top_k=3)
        self.assertEqual(
            nested_simplify(outputs, decimals=6),
            [
                {"sequence": "My name is Clara", "score": 2e-05, "token": 13606, "token_str": " Clara"},
                {"sequence": "My name is Patrick", "score": 2e-05, "token": 3499, "token_str": " Patrick"},
                {"sequence": "My name is Te", "score": 1.9e-05, "token": 2941, "token_str": " Te"},
            ],
        )

    @require_torch
    def test_small_model_pt(self):
        unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", top_k=2, framework="pt")
@ -172,12 +130,6 @@ class FillMaskPipelineTests(unittest.TestCase):
        unmasker = pipeline(task="fill-mask", model="distilbert/distilroberta-base", top_k=2, framework="pt")
        self.run_large_test(unmasker)

    @slow
    @require_tf
    def test_large_model_tf(self):
        unmasker = pipeline(task="fill-mask", model="distilbert/distilroberta-base", top_k=2, framework="tf")
        self.run_large_test(unmasker)

    def run_large_test(self, unmasker):
        outputs = unmasker("My name is <mask>")
        self.assertEqual(
@ -244,13 +196,6 @@ class FillMaskPipelineTests(unittest.TestCase):
        unmasker.tokenizer.pad_token = None
        self.run_pipeline_test(unmasker, [])

    @require_tf
    def test_model_no_pad_tf(self):
        unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="tf")
        unmasker.tokenizer.pad_token_id = None
        unmasker.tokenizer.pad_token = None
        self.run_pipeline_test(unmasker, [])

    def get_test_pipeline(
        self,
        model,

View File

@ -29,7 +29,6 @@ from transformers.testing_utils import (
    compare_pipeline_output_to_hub_spec,
    is_pipeline_test,
    nested_simplify,
    require_tf,
    require_torch,
    require_torch_or_tf,
    require_vision,
@ -175,32 +174,6 @@ class ImageClassificationPipelineTests(unittest.TestCase):
            ],
        )

    @require_tf
    def test_small_model_tf(self):
        small_model = "hf-internal-testing/tiny-random-vit"
        image_classifier = pipeline("image-classification", model=small_model, framework="tf")

        outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
        )

        outputs = image_classifier(
            [
                "http://images.cocodataset.org/val2017/000000039769.jpg",
                "http://images.cocodataset.org/val2017/000000039769.jpg",
            ],
            top_k=2,
        )
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
                [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
            ],
        )

    def test_custom_tokenizer(self):
        tokenizer = PreTrainedTokenizerBase()

View File

@ -22,20 +22,16 @@ from transformers import (
    TF_MODEL_MAPPING,
    TOKENIZER_MAPPING,
    ImageFeatureExtractionPipeline,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    pipeline,
)
from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch
from transformers.testing_utils import is_pipeline_test, nested_simplify, require_torch

if is_torch_available():
    import torch

if is_tf_available():
    import tensorflow as tf

if is_vision_available():
    from PIL import Image
@ -73,28 +69,6 @@ class ImageFeatureExtractionPipelineTests(unittest.TestCase):
            nested_simplify(outputs[0]),
[-0.056, 0.083, 0.021, 0.038, 0.242, -0.279, -0.033, -0.003, 0.200, -0.192, 0.045, -0.095, -0.077, 0.017, -0.058, -0.063, -0.029, -0.204, 0.014, 0.042, 0.305, -0.205, -0.099, 0.146, -0.287, 0.020, 0.168, -0.052, 0.046, 0.048, -0.156, 0.093]) # fmt: skip
    @require_tf
    def test_small_model_tf(self):
        feature_extractor = pipeline(
            task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit-w-pooler", framework="tf"
        )
        img = prepare_img()
        outputs = feature_extractor(img)
        self.assertEqual(
            nested_simplify(outputs[0][0]),
[-1.417, -0.392, -1.264, -1.196, 1.648, 0.885, 0.56, -0.606, -1.175, 0.823, 1.912, 0.081, -0.053, 1.119, -0.062, -1.757, -0.571, 0.075, 0.959, 0.118, 1.201, -0.672, -0.498, 0.364, 0.937, -1.623, 0.228, 0.19, 1.697, -1.115, 0.583, -0.981]) # fmt: skip
    @require_tf
    def test_small_model_w_pooler_tf(self):
        feature_extractor = pipeline(
            task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit-w-pooler", framework="tf"
        )
        img = prepare_img()
        outputs = feature_extractor(img, pool=True)
        self.assertEqual(
            nested_simplify(outputs[0]),
[-0.056, 0.083, 0.021, 0.038, 0.242, -0.279, -0.033, -0.003, 0.200, -0.192, 0.045, -0.095, -0.077, 0.017, -0.058, -0.063, -0.029, -0.204, 0.014, 0.042, 0.305, -0.205, -0.099, 0.146, -0.287, 0.020, 0.168, -0.052, 0.046, 0.048, -0.156, 0.093]) # fmt: skip
    @require_torch
    def test_image_processing_small_model_pt(self):
        feature_extractor = pipeline(
@ -117,28 +91,6 @@ class ImageFeatureExtractionPipelineTests(unittest.TestCase):
        outputs = feature_extractor(img, pool=True)
        self.assertEqual(np.squeeze(outputs).shape, (32,))

    @require_tf
    def test_image_processing_small_model_tf(self):
        feature_extractor = pipeline(
            task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit", framework="tf"
        )

        # test with image processor parameters
        image_processor_kwargs = {"size": {"height": 300, "width": 300}}
        img = prepare_img()
        with pytest.raises(ValueError):
            # Image doesn't match model input size
            feature_extractor(img, image_processor_kwargs=image_processor_kwargs)

        image_processor_kwargs = {"image_mean": [0, 0, 0], "image_std": [1, 1, 1]}
        img = prepare_img()
        outputs = feature_extractor(img, image_processor_kwargs=image_processor_kwargs)
        self.assertEqual(np.squeeze(outputs).shape, (226, 32))

        # Test pooling option
        outputs = feature_extractor(img, pool=True)
        self.assertEqual(np.squeeze(outputs).shape, (32,))

    @require_torch
    def test_return_tensors_pt(self):
        feature_extractor = pipeline(
@ -148,15 +100,6 @@ class ImageFeatureExtractionPipelineTests(unittest.TestCase):
        outputs = feature_extractor(img, return_tensors=True)
        self.assertTrue(torch.is_tensor(outputs))

    @require_tf
    def test_return_tensors_tf(self):
        feature_extractor = pipeline(
            task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit", framework="tf"
        )
        img = prepare_img()
        outputs = feature_extractor(img, return_tensors=True)
        self.assertTrue(tf.is_tensor(outputs))

    def get_test_pipeline(
        self,
        model,

View File

@ -39,7 +39,6 @@ from transformers.testing_utils import (
    compare_pipeline_output_to_hub_spec,
    is_pipeline_test,
    nested_simplify,
    require_tf,
    require_timm,
    require_torch,
    require_vision,
@ -202,11 +201,6 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
        for output_element in single_output:
            compare_pipeline_output_to_hub_spec(output_element, ImageSegmentationOutputElement)

    @require_tf
    @unittest.skip(reason="Image segmentation not implemented in TF")
    def test_small_model_tf(self):
        pass

    @require_torch
    def test_small_model_pt_no_panoptic(self):
        model_id = "hf-internal-testing/tiny-random-mobilevit"

View File

@ -29,7 +29,6 @@ from transformers.testing_utils import (
    Expectations,
    is_pipeline_test,
    nested_simplify,
    require_tf,
    require_torch,
    require_vision,
    slow,
@ -103,11 +102,6 @@ class MaskGenerationPipelineTests(unittest.TestCase):
    def run_pipeline_test(self, mask_generator, examples):
        pass

    @require_tf
    @unittest.skip(reason="Image segmentation not implemented in TF")
    def test_small_model_tf(self):
        pass

    @slow
    @require_torch
    def test_small_model_pt(self):

View File

@ -30,7 +30,6 @@ from transformers.testing_utils import (
    is_pipeline_test,
    nested_simplify,
    require_pytesseract,
    require_tf,
    require_timm,
    require_torch,
    require_vision,
@ -128,11 +127,6 @@ class ObjectDetectionPipelineTests(unittest.TestCase):
            )
            compare_pipeline_output_to_hub_spec(detected_object, ObjectDetectionOutputElement)

    @require_tf
    @unittest.skip(reason="Object detection not implemented in TF")
    def test_small_model_tf(self):
        pass

    @require_torch
    def test_small_model_pt(self):
        model_id = "hf-internal-testing/tiny-detr-mobilenetsv3"

View File

@ -29,7 +29,6 @@ from transformers.testing_utils import (
    is_pipeline_test,
    is_torch_available,
    nested_simplify,
    require_tf,
    require_torch,
    require_torch_or_tf,
    slow,
@ -296,17 +295,6 @@ class QAPipelineTests(unittest.TestCase):
        answers = [output["answer"] for output in outputs]
        self.assertEqual(len(answers), len(set(answers)), "There are duplicate answers in the outputs.")

    @require_tf
    def test_small_model_tf(self):
        question_answerer = pipeline(
            "question-answering", model="sshleifer/tiny-distilbert-base-cased-distilled-squad", framework="tf"
        )

        outputs = question_answerer(
            question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris."
        )

        self.assertEqual(nested_simplify(outputs), {"score": 0.011, "start": 0, "end": 11, "answer": "HuggingFace"})

    @slow
    @require_torch
    def test_large_model_pt(self):
@ -421,16 +409,6 @@ between them. It's straightforward to train your models with one before loading
            {"answer": "Jax, PyTorch and TensorFlow", "end": 1919, "score": 0.971, "start": 1892},
        )

    @slow
    @require_tf
    def test_large_model_tf(self):
        question_answerer = pipeline("question-answering", framework="tf")
        outputs = question_answerer(
            question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris."
        )

        self.assertEqual(nested_simplify(outputs), {"score": 0.979, "start": 27, "end": 32, "answer": "Paris"})


@require_torch_or_tf
class QuestionAnsweringArgumentHandlerTests(unittest.TestCase):

View File

@ -26,7 +26,6 @@ from transformers.testing_utils import (
    is_pipeline_test,
    require_pandas,
    require_tensorflow_probability,
    require_tf,
    require_torch,
    slow,
)
@ -38,111 +37,6 @@ class TQAPipelineTests(unittest.TestCase):
# which are needed to generate automatic tests
model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
@require_tensorflow_probability
@require_pandas
@require_tf
@require_torch
def test_small_model_tf(self):
model_id = "lysandre/tiny-tapas-random-wtq"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
self.assertIsInstance(model.config.aggregation_labels, dict)
self.assertIsInstance(model.config.no_aggregation_label_index, int)
table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer, max_new_tokens=20)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query="how many movies has george clooney played in?",
)
self.assertEqual(
outputs,
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
)
self.assertEqual(
outputs,
[
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
],
)
outputs = table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
query=[
"What repository has the largest number of stars?",
"Given that the numbers of stars defines if a repository is active, what repository is the most"
" active?",
"What is the number of repositories?",
"What is the average number of stars?",
"What is the total amount of stars?",
],
)
self.assertEqual(
outputs,
[
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
],
)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table=None)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table="")
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table={})
with self.assertRaises(ValueError):
table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
}
)
with self.assertRaises(ValueError):
table_querier(
query="",
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
with self.assertRaises(ValueError):
table_querier(
query=None,
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
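# Hedged sketch of the input contract those ValueError checks exercise: the
# pipeline rejects an empty or missing table and an empty or missing query
# before any model call (the helper below is illustrative, not the real code).
def validate_tqa_inputs(query=None, table=None):
    if table is None or len(table) == 0:
        raise ValueError("table is empty")
    if query is None or len(query) == 0:
        raise ValueError("query is empty")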
@require_torch
def test_small_model_pt(self, torch_dtype="float32"):
model_id = "lysandre/tiny-tapas-random-wtq"
@@ -372,128 +266,6 @@ class TQAPipelineTests(unittest.TestCase):
def test_slow_tokenizer_sqa_pt_fp16(self):
self.test_slow_tokenizer_sqa_pt(torch_dtype="float16")
@require_tf
@require_tensorflow_probability
@require_pandas
@require_torch
def test_slow_tokenizer_sqa_tf(self):
model_id = "lysandre/tiny-tapas-random-sqa"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer, max_new_tokens=20)
inputs = {
"table": {
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
"query": ["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
}
sequential_outputs = table_querier(**inputs, sequential=True)
batch_outputs = table_querier(**inputs, sequential=False)
self.assertEqual(len(sequential_outputs), 3)
self.assertEqual(len(batch_outputs), 3)
self.assertEqual(sequential_outputs[0], batch_outputs[0])
self.assertNotEqual(sequential_outputs[1], batch_outputs[1])
# self.assertNotEqual(sequential_outputs[2], batch_outputs[2])
table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer, max_new_tokens=20)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query="how many movies has george clooney played in?",
)
self.assertEqual(
outputs,
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
)
self.assertEqual(
outputs,
[
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
],
)
outputs = table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
query=[
"What repository has the largest number of stars?",
"Given that the numbers of stars defines if a repository is active, what repository is the most"
" active?",
"What is the number of repositories?",
"What is the average number of stars?",
"What is the total amount of stars?",
],
)
self.assertEqual(
outputs,
[
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
],
)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table=None)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table="")
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table={})
with self.assertRaises(ValueError):
table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
}
)
with self.assertRaises(ValueError):
table_querier(
query="",
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
with self.assertRaises(ValueError):
table_querier(
query=None,
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
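# Hedged reading of the sequential-vs-batched asserts above: in sequential mode
# each question after the first is conditioned on the previous answers, so its
# inputs differ from the independent, batched mode and later answers may change.
# Illustrative control flow only; `answer_one` is a made-up helper:
def answer_sequentially(answer_one, questions, table):
    prev_answer, results = None, []
    for question in questions:
        prev_answer = answer_one(question, table, prev_answer)
        results.append(prev_answer)
    return results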
@slow
@require_torch
def test_integration_wtq_pt(self, torch_dtype="float32"):

View File

@@ -24,7 +24,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_torch_available,
nested_simplify,
require_tf,
require_torch,
require_torch_bf16,
require_torch_fp16,
@@ -152,15 +151,6 @@ class TextClassificationPipelineTests(unittest.TestCase):
outputs = text_classifier("This is great !")
self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}])
@require_tf
def test_small_model_tf(self):
text_classifier = pipeline(
task="text-classification", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
)
outputs = text_classifier("This is great !")
self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}])
@slow
@require_torch
def test_pt_bert(self):
@@ -173,18 +163,6 @@ class TextClassificationPipelineTests(unittest.TestCase):
outputs = text_classifier("Birds are a type of animal")
self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}])
@slow
@require_tf
def test_tf_bert(self):
text_classifier = pipeline("text-classification", framework="tf")
outputs = text_classifier("This is great !")
self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 1.0}])
outputs = text_classifier("This is bad !")
self.assertEqual(nested_simplify(outputs), [{"label": "NEGATIVE", "score": 1.0}])
outputs = text_classifier("Birds are a type of animal")
self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}])
def get_test_pipeline(
self,
model,

View File

@@ -29,7 +29,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_torch_available,
nested_simplify,
require_tf,
require_torch,
require_torch_accelerator,
slow,
@@ -823,26 +822,6 @@ class TokenClassificationPipelineTests(unittest.TestCase):
[("▁I", False), ("▁play", False), ("▁the", False), ("▁there", False), ("min", True)],
)
@require_tf
def test_tf_only(self):
model_name = "hf-internal-testing/tiny-random-bert-tf-only" # This model only has a TensorFlow version
# We test that if we don't specify framework='tf', it gets detected automatically
token_classifier = pipeline(task="ner", model=model_name)
self.assertEqual(token_classifier.framework, "tf")
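# Hedged sketch of the same detection rule from the PyTorch side: with torch
# installed and a checkpoint carrying PyTorch weights, no `framework=` argument
# is needed and "pt" should be picked up (checkpoint choice is illustrative).
from transformers import pipeline

token_classifier = pipeline(task="ner", model="hf-internal-testing/tiny-bert-for-token-classification")
assert token_classifier.framework == "pt"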
@require_tf
def test_small_model_tf(self):
model_name = "hf-internal-testing/tiny-bert-for-token-classification"
token_classifier = pipeline(task="token-classification", model=model_name, framework="tf")
outputs = token_classifier("This is a test !")
self.assertEqual(
nested_simplify(outputs),
[
{"entity": "I-MISC", "score": 0.115, "index": 1, "word": "this", "start": 0, "end": 4},
{"entity": "I-MISC", "score": 0.115, "index": 2, "word": "is", "start": 5, "end": 7},
],
)
@require_torch
def test_no_offset_tokenizer(self):
model_name = "hf-internal-testing/tiny-bert-for-token-classification"

View File

@@ -23,7 +23,6 @@ from transformers.testing_utils import (
is_pipeline_test,
nested_simplify,
require_av,
require_tf,
require_torch,
require_torch_or_tf,
require_vision,
@@ -124,8 +123,3 @@ class VideoClassificationPipelineTests(unittest.TestCase):
for output in outputs:
for element in output:
compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
@require_tf
@unittest.skip
def test_small_model_tf(self):
pass

View File

@@ -22,7 +22,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_torch_available,
nested_simplify,
require_tf,
require_torch,
require_torch_accelerator,
require_vision,
@@ -246,8 +245,3 @@ class VisualQuestionAnsweringPipelineTests(unittest.TestCase):
[{"score": ANY(float), "answer": ANY(str)}],
],
)
@require_tf
@unittest.skip(reason="Visual question answering not implemented in TF")
def test_small_model_tf(self):
pass

View File

@@ -25,7 +25,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_torch_available,
nested_simplify,
require_tf,
require_torch,
slow,
)
@@ -243,26 +242,6 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase):
},
)
@require_tf
def test_small_model_tf(self):
zero_shot_classifier = pipeline(
"zero-shot-classification",
model="sshleifer/tiny-distilbert-base-cased-distilled-squad",
framework="tf",
)
outputs = zero_shot_classifier(
"Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"]
)
self.assertEqual(
nested_simplify(outputs),
{
"sequence": "Who are you voting for in 2020?",
"labels": ["science", "public health", "politics"],
"scores": [0.333, 0.333, 0.333],
},
)
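# Hedged sketch of why the tiny random model gives three equal scores: the
# zero-shot pipeline turns each candidate label into an NLI hypothesis, scores
# entailment, and softmaxes over labels; a random model yields near-equal logits.
import numpy as np

def label_scores(entailment_logits):
    e = np.exp(entailment_logits - np.max(entailment_logits))
    return e / e.sum()

print(label_scores(np.zeros(3)))  # -> [0.3333... 0.3333... 0.3333...]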
@slow
@require_torch
def test_large_model_pt(self):
@@ -319,60 +298,3 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase):
"scores": [0.817, 0.713, 0.018, 0.018],
},
)
@slow
@require_tf
def test_large_model_tf(self):
zero_shot_classifier = pipeline(
"zero-shot-classification", model="FacebookAI/roberta-large-mnli", framework="tf"
)
outputs = zero_shot_classifier(
"Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"]
)
self.assertEqual(
nested_simplify(outputs),
{
"sequence": "Who are you voting for in 2020?",
"labels": ["politics", "public health", "science"],
"scores": [0.976, 0.015, 0.009],
},
)
outputs = zero_shot_classifier(
"The dominant sequence transduction models are based on complex recurrent or convolutional neural networks"
" in an encoder-decoder configuration. The best performing models also connect the encoder and decoder"
" through an attention mechanism. We propose a new simple network architecture, the Transformer, based"
" solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two"
" machine translation tasks show these models to be superior in quality while being more parallelizable"
" and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014"
" English-to-German translation task, improving over the existing best results, including ensembles by"
" over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new"
" single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small"
" fraction of the training costs of the best models from the literature. We show that the Transformer"
" generalizes well to other tasks by applying it successfully to English constituency parsing both with"
" large and limited training data.",
candidate_labels=["machine learning", "statistics", "translation", "vision"],
multi_label=True,
)
self.assertEqual(
nested_simplify(outputs),
{
"sequence": (
"The dominant sequence transduction models are based on complex recurrent or convolutional neural"
" networks in an encoder-decoder configuration. The best performing models also connect the"
" encoder and decoder through an attention mechanism. We propose a new simple network"
" architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence"
" and convolutions entirely. Experiments on two machine translation tasks show these models to be"
" superior in quality while being more parallelizable and requiring significantly less time to"
" train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task,"
" improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014"
" English-to-French translation task, our model establishes a new single-model state-of-the-art"
" BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training"
" costs of the best models from the literature. We show that the Transformer generalizes well to"
" other tasks by applying it successfully to English constituency parsing both with large and"
" limited training data."
),
"labels": ["translation", "machine learning", "vision", "statistics"],
"scores": [0.817, 0.713, 0.018, 0.018],
},
)

View File

@@ -22,7 +22,6 @@ from transformers.testing_utils import (
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
require_tf,
require_torch,
require_vision,
slow,
@@ -137,57 +136,6 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase):
def test_small_model_pt_fp16(self):
self.test_small_model_pt(torch_dtype="float16")
@require_tf
def test_small_model_tf(self):
image_classifier = pipeline(
model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", framework="tf"
)
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
output = image_classifier(image, candidate_labels=["a", "b", "c"])
self.assertEqual(
nested_simplify(output),
[{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}],
)
output = image_classifier([image] * 5, candidate_labels=["A", "B", "C"], batch_size=2)
self.assertEqual(
nested_simplify(output),
# Pipeline outputs are supposed to be deterministic, so we could
# in theory have the real values "A", "B", "C" instead of ANY(str).
# However, in this particular case the floating-point scores are so
# close that we run into floating-point error, and the order is no
# longer guaranteed with batching.
[
[
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
],
[
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
],
[
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
],
[
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
],
[
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
],
],
)
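# Hedged sketch of one way to keep such near-tie assertions order-stable: sort
# each prediction list by label before comparing, so floating-point noise in
# the batched path cannot flip the expected order.
def sorted_by_label(predictions):
    return sorted(predictions, key=lambda d: d["label"])

assert [p["label"] for p in sorted_by_label(
    [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}]
)] == ["A", "B"]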
@slow
@require_torch
def test_large_model_pt(self):
@@ -221,37 +169,6 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase):
* 5,
)
@slow
@require_tf
def test_large_model_tf(self):
image_classifier = pipeline(
task="zero-shot-image-classification", model="openai/clip-vit-base-patch32", framework="tf"
)
# This is an image of 2 cats with remotes and no planes
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
output = image_classifier(image, candidate_labels=["cat", "plane", "remote"])
self.assertEqual(
nested_simplify(output),
[
{"score": 0.511, "label": "remote"},
{"score": 0.485, "label": "cat"},
{"score": 0.004, "label": "plane"},
],
)
output = image_classifier([image] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2)
self.assertEqual(
nested_simplify(output),
[
[
{"score": 0.511, "label": "remote"},
{"score": 0.485, "label": "cat"},
{"score": 0.004, "label": "plane"},
],
]
* 5,
)
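# Hedged numpy sketch of where those scores come from: CLIP embeds the image and
# one prompt per candidate label, scales the cosine similarities, and softmaxes
# over labels — hence the scores summing to ~1 (logit_scale value is an assumption).
import numpy as np

def clip_label_scores(image_emb, text_embs, logit_scale=100.0):
    image_emb = image_emb / np.linalg.norm(image_emb)
    text_embs = text_embs / np.linalg.norm(text_embs, axis=-1, keepdims=True)
    logits = logit_scale * (text_embs @ image_emb)  # one logit per candidate label
    e = np.exp(logits - logits.max())
    return e / e.sum()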
@slow
@require_torch
def test_siglip_model_pt(self):

View File

@@ -23,7 +23,6 @@ from transformers import (
from transformers.testing_utils import (
is_pipeline_test,
nested_simplify,
require_tf,
require_torch,
require_vision,
slow,
@@ -90,11 +89,6 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
],
)
@require_tf
@unittest.skip(reason="Zero Shot Object Detection not implemented in TF")
def test_small_model_tf(self):
pass
@require_torch
def test_small_model_pt(self):
object_detector = pipeline(
@@ -201,11 +195,6 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
],
)
@require_tf
@unittest.skip(reason="Zero Shot Object Detection not implemented in TF")
def test_large_model_tf(self):
pass
@require_torch
@slow
def test_threshold(self):

View File

@@ -17,16 +17,13 @@ import unittest
import numpy as np
from parameterized import parameterized
from transformers.testing_utils import require_flax, require_tf, require_torch, require_vision
from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available, is_vision_available
from transformers.testing_utils import require_flax, require_torch, require_vision
from transformers.utils.import_utils import is_flax_available, is_torch_available, is_vision_available
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
if is_flax_available():
import jax
@@ -122,20 +119,6 @@ class ImageTransformsTester(unittest.TestCase):
self.assertTrue(np_img.min() == 0)
self.assertTrue(np_img.max() == 1)
@require_tf
def test_to_pil_image_from_tensorflow(self):
# channels_first
image = tf.random.uniform((3, 4, 5))
pil_image = to_pil_image(image)
self.assertIsInstance(pil_image, PIL.Image.Image)
self.assertEqual(pil_image.size, (5, 4))
# channels_last
image = tf.random.uniform((4, 5, 3))
pil_image = to_pil_image(image)
self.assertIsInstance(pil_image, PIL.Image.Image)
self.assertEqual(pil_image.size, (5, 4))
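# The same layout inference holds for numpy inputs — a hedged sketch (assumes
# float arrays in [0, 1], which `to_pil_image` rescales to 0-255):
import numpy as np
from transformers.image_transforms import to_pil_image

assert to_pil_image(np.random.rand(3, 4, 5)).size == (5, 4)  # channels_first
assert to_pil_image(np.random.rand(4, 5, 3)).size == (5, 4)  # channels_last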
@require_torch
def test_to_pil_image_from_torch(self):
# channels first

View File

@@ -16,7 +16,7 @@
import numpy as np
from transformers import BatchFeature
from transformers.testing_utils import require_tf, require_torch
from transformers.testing_utils import require_torch
from .test_feature_extraction_common import FeatureExtractionSavingTestMixin
@@ -76,24 +76,6 @@ class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin):
== (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size)
)
@require_tf
def test_batch_feature_tf(self):
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True)
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
input_name = feat_extract.model_input_names[0]
processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="tf")
batch_features_input = processed_features[input_name]
if len(batch_features_input.shape) < 3:
batch_features_input = batch_features_input[:, :, None]
self.assertTrue(
batch_features_input.shape
== (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size)
)
def _check_padding(self, numpify=False):
def _inputs_have_equal_length(input):
length = len(input[0])
@@ -372,19 +354,6 @@ class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin):
self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().astype(np.float32).sum()) < 1e-2)
@require_tf
def test_padding_accepts_tensors_tf(self):
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common()
input_name = feat_extract.model_input_names[0]
processed_features = BatchFeature({input_name: speech_inputs})
input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
input_tf = feat_extract.pad(processed_features, padding="longest", return_tensors="tf")[input_name]
self.assertTrue(abs(input_np.astype(np.float32).sum() - input_tf.numpy().astype(np.float32).sum()) < 1e-2)
def test_attention_mask(self):
feat_dict = self.feat_extract_dict
feat_dict["return_attention_mask"] = True

View File

@@ -53,7 +53,6 @@ from transformers.testing_utils import (
get_tests_dir,
require_jinja,
require_read_token,
require_tf,
require_tokenizers,
require_torch,
run_test_in_subprocess,
@@ -3106,40 +3105,6 @@ class TokenizerTesterMixin:
# model(**encoded_sequence_fast)
# model(**batch_encoded_sequence_fast)
@require_tf
@slow
def test_tf_encode_plus_sent_to_model(self):
from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING
MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING)
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER_MAPPING")
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
self.skipTest(reason="Model is not an encoder-decoder model or has no set pad token id")
model = model_class(config)
# Make sure the model contains at least the full vocabulary size in its embedding matrix
self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))
# Build sequence
first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
sequence = " ".join(first_ten_tokens)
encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="tf")
batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="tf")
# This should not fail
model(encoded_sequence)
model(batch_encoded_sequence)
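# Hedged PyTorch-side sketch of the same smoke test: encode with
# return_tensors="pt" and check that a forward pass accepts the encoding
# (checkpoint choice is illustrative).
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
model = AutoModel.from_pretrained("google-bert/bert-base-cased")
with torch.no_grad():
    encoded = tokenizer("a small smoke-test sequence", return_tensors="pt")
    model(**encoded)  # should not fail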
# TODO: Check if require_torch is the best to test for numpy here ... Maybe move to require_flax when available
@require_torch
@slow

View File

@@ -39,7 +39,6 @@ from transformers.testing_utils import (
CaptureStderr,
require_flax,
require_sentencepiece,
require_tf,
require_tokenizers,
require_torch,
slow,
@@ -121,27 +120,6 @@ class TokenizerUtilsTest(unittest.TestCase):
tokenizer_r("Small example to encode", return_tensors=TensorType.NUMPY), np.array_equal
)
@require_tf
@require_tokenizers
def test_batch_encoding_pickle_tf(self):
import tensorflow as tf
def tf_array_equals(t1, t2):
return tf.reduce_all(tf.equal(t1, t2))
tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
with self.subTest("BatchEncoding (Python, return_tensors=TENSORFLOW)"):
self.assert_dump_and_restore(
tokenizer_p("Small example to encode", return_tensors=TensorType.TENSORFLOW), tf_array_equals
)
with self.subTest("BatchEncoding (Rust, return_tensors=TENSORFLOW)"):
self.assert_dump_and_restore(
tokenizer_r("Small example to encode", return_tensors=TensorType.TENSORFLOW), tf_array_equals
)
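# Hedged numpy-backend sketch of what assert_dump_and_restore verifies: a
# BatchEncoding should survive a pickle round trip with its arrays intact
# (reuses tokenizer_r from the surrounding test).
import pickle
import numpy as np

encoding = tokenizer_r("Small example to encode", return_tensors=TensorType.NUMPY)
restored = pickle.loads(pickle.dumps(encoding))
assert all(np.array_equal(encoding[k], restored[k]) for k in encoding)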
@require_torch
@require_tokenizers
def test_batch_encoding_pickle_pt(self):
@@ -211,22 +189,6 @@ class TokenizerUtilsTest(unittest.TestCase):
self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
self.assertEqual(tensor_batch["labels"].shape, (1,))
@require_tf
def test_batch_encoding_with_labels_tf(self):
batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
tensor_batch = batch.convert_to_tensors(tensor_type="tf")
self.assertEqual(tensor_batch["inputs"].shape, (2, 3))
self.assertEqual(tensor_batch["labels"].shape, (2,))
# test converting the converted
with CaptureStderr() as cs:
tensor_batch = batch.convert_to_tensors(tensor_type="tf")
self.assertFalse(len(cs.err), msg=f"should have no warning, but got {cs.err}")
batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0})
tensor_batch = batch.convert_to_tensors(tensor_type="tf", prepend_batch_axis=True)
self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
self.assertEqual(tensor_batch["labels"].shape, (1,))
@require_flax
def test_batch_encoding_with_labels_jax(self):
batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
@@ -381,20 +343,6 @@ class TokenizerUtilsTest(unittest.TestCase):
self.assertTrue(isinstance(batch["input_ids"], torch.Tensor))
self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
@require_tf
def test_padding_accepts_tensors_tf(self):
import tensorflow as tf
features = [{"input_ids": tf.constant([0, 1, 2])}, {"input_ids": tf.constant([0, 1, 2, 3])}]
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
batch = tokenizer.pad(features, padding=True)
self.assertTrue(isinstance(batch["input_ids"], tf.Tensor))
self.assertEqual(batch["input_ids"].numpy().tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
batch = tokenizer.pad(features, padding=True, return_tensors="tf")
self.assertTrue(isinstance(batch["input_ids"], tf.Tensor))
self.assertEqual(batch["input_ids"].numpy().tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
@require_tokenizers
def test_instantiation_from_tokenizers(self):
bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

View File

@@ -29,20 +29,16 @@ from transformers import (
DataCollatorWithFlattening,
DataCollatorWithPadding,
default_data_collator,
is_tf_available,
is_torch_available,
set_seed,
)
from transformers.testing_utils import require_tf, require_torch
from transformers.testing_utils import require_torch
from transformers.utils import PaddingStrategy
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
@require_torch
class DataCollatorIntegrationTest(unittest.TestCase):
@@ -1022,795 +1018,6 @@ class DataCollatorImmutabilityTest(unittest.TestCase):
)
@require_tf
class TFDataCollatorIntegrationTest(unittest.TestCase):
def setUp(self):
super().setUp()
self.tmpdirname = tempfile.mkdtemp()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt")
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_default_with_dict(self):
features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].numpy().tolist(), list(range(8)))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
# With label_ids
features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].numpy().tolist(), ([[0, 1, 2]] * 8))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
# Features can already be tensors
features = [{"label": i, "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].numpy().tolist(), (list(range(8))))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 10])
# Labels can already be tensors
features = [{"label": np.array(i), "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["labels"].numpy().tolist(), list(range(8)))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 10])
def test_numpy_dtype_preservation(self):
data_collator = default_data_collator
# Confirm that numpy inputs are handled correctly even when they are scalars
features = [{"input_ids": np.array([0, 1, 2, 3, 4]), "label": np.int64(i)} for i in range(4)]
batch = data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.int64)
def test_default_classification_and_regression(self):
data_collator = default_data_collator
features = [{"input_ids": [0, 1, 2, 3, 4], "label": i} for i in range(4)]
batch = data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.int64)
features = [{"input_ids": [0, 1, 2, 3, 4], "label": float(i)} for i in range(4)]
batch = data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.float32)
def test_default_with_no_labels(self):
features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertTrue("labels" not in batch)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
# With label_ids
features = [{"label_ids": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertTrue("labels" not in batch)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
def test_data_collator_with_padding(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}]
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=10, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, [2, 8])
def test_data_collator_for_token_classification(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{"input_ids": [0, 1, 2], "labels": [0, 1, 2]},
{"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]},
]
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
self.assertEqual(batch["labels"][0].numpy().tolist(), [0, 1, 2] + [-100] * 3)
data_collator = DataCollatorForTokenClassification(
tokenizer, padding="max_length", max_length=10, return_tensors="tf"
)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
self.assertEqual(batch["labels"][0].numpy().tolist(), [0, 1, 2] + [-1] * 3)
def test_data_collator_for_seq2seq(self):
def create_features():
return [
{"input_ids": list(range(3)), "labels": list(range(3))},
{"input_ids": list(range(6)), "labels": list(range(6))},
]
tokenizer = BertTokenizer(self.vocab_file)
features = create_features()
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=PaddingStrategy.LONGEST, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), list(range(3)) + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["input_ids"][1].numpy().tolist(), list(range(6)))
self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
self.assertEqual(batch["labels"][0].numpy().tolist(), list(range(3)) + [-100] * 3)
self.assertEqual(batch["labels"][1].numpy().tolist(), list(range(6)))
data_collator = DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.MAX_LENGTH, max_length=7, return_tensors="tf"
)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 7])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), list(range(3)) + [tokenizer.pad_token_id] * 4)
self.assertEqual(batch["input_ids"][1].numpy().tolist(), list(range(6)) + [tokenizer.pad_token_id] * 1)
self.assertEqual(batch["labels"].shape.as_list(), [2, 7])
self.assertEqual(batch["labels"][0].numpy().tolist(), list(range(3)) + [-100] * 4)
self.assertEqual(batch["labels"][1].numpy().tolist(), list(range(6)) + [-100] * 1)
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=PaddingStrategy.DO_NOT_PAD, return_tensors="tf")
with self.assertRaises(ValueError):
# expect an error: without padding, the unequal lengths cannot be stacked into a tensor
data_collator(features)
batch = data_collator([features[0], features[0]])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), features[0]["input_ids"])
self.assertEqual(batch["input_ids"][1].numpy().tolist(), features[0]["input_ids"])
self.assertEqual(batch["labels"][0].numpy().tolist(), features[0]["labels"])
self.assertEqual(batch["labels"][1].numpy().tolist(), features[0]["labels"])
data_collator = DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.LONGEST, pad_to_multiple_of=8, return_tensors="tf"
)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
# side effects on the labels would cause a mismatch under the longest-padding strategy
features = create_features()
data_collator = DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.LONGEST, label_pad_token_id=-1, return_tensors="tf"
)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), list(range(3)) + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["input_ids"][1].numpy().tolist(), list(range(6)))
self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
self.assertEqual(batch["labels"][0].numpy().tolist(), list(range(3)) + [-1] * 3)
self.assertEqual(batch["labels"][1].numpy().tolist(), list(range(6)))
for feature in features:
feature.pop("labels")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), list(range(3)) + [tokenizer.pad_token_id] * 3)
def _test_no_pad_and_pad(self, no_pad_features, pad_features):
tokenizer = BertTokenizer(self.vocab_file)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
batch = data_collator(pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
data_collator = DataCollatorForLanguageModeling(
tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="tf"
)
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
batch = data_collator(pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
tokenizer.pad_token = None
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
with self.assertRaises(ValueError):
# Expect an error because the padding token is missing
data_collator(pad_features)
set_seed(42) # For reproducibility
tokenizer = BertTokenizer(self.vocab_file)
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
batch = data_collator(pad_features, return_tensors="tf")
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
batch = data_collator(pad_features, return_tensors="tf")
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
def test_probability_sum_error(self):
"""Test that the sum of mask_replace_prob and random_replace_prob exceeding 1 raises an error."""
tokenizer = BertTokenizer(self.vocab_file)
with self.assertRaises(ValueError):
DataCollatorForLanguageModeling(tokenizer=tokenizer, mask_replace_prob=0.9, random_replace_prob=0.2)
def test_all_mask_replacement(self):
"""Test behavior when mask_replace_prob=1."""
tokenizer = BertTokenizer(self.vocab_file)
# pytorch call
collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mask_replace_prob=1, random_replace_prob=0, return_tensors="pt"
)
inputs = torch.tensor([0, 1, 2, 3, 4, 5])
features = [{"input_ids": inputs} for _ in range(8)]
batch = collator(features)
# confirm that every token is either the original token or [MASK]
self.assertTrue(torch.all((batch["input_ids"] == inputs) | (batch["input_ids"] == tokenizer.mask_token_id)))
# tf call
collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mask_replace_prob=1, random_replace_prob=0, return_tensors="tf"
)
inputs = tf.constant([0, 1, 2, 3, 4, 5])
features = [{"input_ids": inputs} for _ in range(8)]
batch = collator(features)
# confirm that every token is either the original token or [MASK]
self.assertTrue(
tf.reduce_all(
(batch["input_ids"] == tf.cast(inputs, tf.int64)) | (batch["input_ids"] == tokenizer.mask_token_id)
)
)
# numpy call
collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mask_replace_prob=1, random_replace_prob=0, return_tensors="np"
)
inputs = np.array([0, 1, 2, 3, 4, 5])
features = [{"input_ids": inputs} for _ in range(8)]
batch = collator(features)
# confirm that every token is either the original token or [MASK]
self.assertTrue(np.all((batch["input_ids"] == inputs) | (batch["input_ids"] == tokenizer.mask_token_id)))
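# Hedged sketch of the split these probabilities control: among tokens selected
# for masking, `mask_replace_prob` become [MASK], `random_replace_prob` become a
# random token, and the remainder stay unchanged — hence the two must not sum to
# more than 1, and mask_replace_prob=1 above turns every selected token into [MASK].
def replacement_split(mask_replace_prob, random_replace_prob):
    if mask_replace_prob + random_replace_prob > 1:
        raise ValueError("mask_replace_prob + random_replace_prob must not exceed 1")
    return mask_replace_prob, random_replace_prob, 1.0 - mask_replace_prob - random_replace_prob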
def test_data_collator_for_language_modeling(self):
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
self._test_no_pad_and_pad(no_pad_features, pad_features)
no_pad_features = [list(range(10)), list(range(10))]
pad_features = [list(range(5)), list(range(10))]
self._test_no_pad_and_pad(no_pad_features, pad_features)
def test_data_collator_for_language_modeling_with_seed(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [{"input_ids": list(range(1000))}, {"input_ids": list(range(1000))}]
# check if seed is respected between two different DataCollatorForLanguageModeling instances
data_collator = DataCollatorForLanguageModeling(tokenizer, seed=42, return_tensors="tf")
batch_1 = data_collator(features)
self.assertEqual(batch_1["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_1["labels"].shape.as_list(), [2, 1000])
data_collator = DataCollatorForLanguageModeling(tokenizer, seed=42, return_tensors="tf")
batch_2 = data_collator(features)
self.assertEqual(batch_2["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_2["labels"].shape.as_list(), [2, 1000])
self.assertTrue(np.all(batch_1["input_ids"] == batch_2["input_ids"]))
self.assertTrue(np.all(batch_1["labels"] == batch_2["labels"]))
# try with different seed
data_collator = DataCollatorForLanguageModeling(tokenizer, seed=43, return_tensors="tf")
batch_3 = data_collator(features)
self.assertEqual(batch_3["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_3["labels"].shape.as_list(), [2, 1000])
self.assertFalse(np.all(batch_1["input_ids"] == batch_3["input_ids"]))
self.assertFalse(np.all(batch_1["labels"] == batch_3["labels"]))
def test_data_collator_for_whole_word_mask(self):
tokenizer = BertTokenizer(self.vocab_file)
data_collator = DataCollatorForWholeWordMask(tokenizer, return_tensors="tf")
features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
# Features can already be tensors
features = [{"input_ids": np.arange(10)}, {"input_ids": np.arange(10)}]
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
def test_data_collator_for_whole_word_mask_with_seed(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [{"input_ids": list(range(1000))}, {"input_ids": list(range(1000))}]
# check if seed is respected between two different DataCollatorForWholeWordMask instances
data_collator = DataCollatorForWholeWordMask(tokenizer, seed=42, return_tensors="tf")
batch_1 = data_collator(features)
self.assertEqual(batch_1["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_1["labels"].shape.as_list(), [2, 1000])
data_collator = DataCollatorForWholeWordMask(tokenizer, seed=42, return_tensors="tf")
batch_2 = data_collator(features)
self.assertEqual(batch_2["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_2["labels"].shape.as_list(), [2, 1000])
self.assertTrue(np.all(batch_1["input_ids"] == batch_2["input_ids"]))
self.assertTrue(np.all(batch_1["labels"] == batch_2["labels"]))
# try with different seed
data_collator = DataCollatorForWholeWordMask(tokenizer, seed=43, return_tensors="tf")
batch_3 = data_collator(features)
self.assertEqual(batch_3["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_3["labels"].shape.as_list(), [2, 1000])
self.assertFalse(np.all(batch_1["input_ids"] == batch_3["input_ids"]))
self.assertFalse(np.all(batch_1["labels"] == batch_3["labels"]))
def test_plm(self):
tokenizer = BertTokenizer(self.vocab_file)
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
data_collator = DataCollatorForPermutationLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(pad_features)
self.assertIsInstance(batch, dict)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["perm_mask"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["target_mapping"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
batch = data_collator(no_pad_features)
self.assertIsInstance(batch, dict)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["perm_mask"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["target_mapping"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
example = [np.random.randint(0, 5, [5])]
with self.assertRaises(ValueError):
# Expect error due to odd sequence length
data_collator(example)
def test_nsp(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i}
for i in range(2)
]
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["labels"].shape.as_list(), [2, 5])
self.assertEqual(batch["next_sentence_label"].shape.as_list(), [2])
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
self.assertEqual(batch["next_sentence_label"].shape.as_list(), [2])
def test_sop(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{
"input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
"token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
"sentence_order_label": i,
}
for i in range(2)
]
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["labels"].shape.as_list(), [2, 5])
self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])
@require_tf
class TFDataCollatorImmutabilityTest(unittest.TestCase):
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt")
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def _turn_to_none(self, item):
"""used to convert `item` to `None` type"""
return None
def _validate_original_data_against_collated_data(self, collator, original_data, batch_data):
# we only care about side effects; the results are tested elsewhere
collator(batch_data)
# go through every item, converting to primitive datatypes where necessary,
# then compare the original data against the data that was passed through the collator
for original, batch in zip(original_data, batch_data):
for original_val, batch_val in zip(original.values(), batch.values()):
if isinstance(original_val, np.ndarray):
self.assertEqual(original_val.tolist(), batch_val.tolist())
elif isinstance(original_val, tf.Tensor):
self.assertEqual(original_val.numpy().tolist(), batch_val.numpy().tolist())
else:
self.assertEqual(original_val, batch_val)
def _validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
self, collator, base_data, input_key, input_datatype, label_key, label_datatype, ignore_label=False
):
# using the arguments to recreate the features with their respective (potentially new) datatypes
features_original = [
{label_key: label_datatype(sample[label_key]), input_key: input_datatype(sample[input_key])}
for sample in base_data
]
features_batch = [
{label_key: label_datatype(sample[label_key]), input_key: input_datatype(sample[input_key])}
for sample in base_data
]
# some collators do not use labels; we also want to check that a collator which expects labels can handle their absence
if ignore_label:
for original, batch in zip(features_original, features_batch):
original.pop(label_key)
batch.pop(label_key)
self._validate_original_data_against_collated_data(
collator=collator, original_data=features_original, batch_data=features_batch
)
def test_default_collator_immutability(self):
features_base_single_label = [{"label": i, "inputs": (0, 1, 2, 3, 4, 5)} for i in range(4)]
features_base_multiple_labels = [{"label": (0, 1, 2), "inputs": (0, 1, 2, 3, 4, 5)} for i in range(4)]
for datatype_input, datatype_label in [
(list, int),
(list, float),
(np.array, int),
(np.array, tf.constant),
(list, self._turn_to_none),
]:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=lambda x: default_data_collator(x, return_tensors="tf"),
base_data=features_base_single_label,
input_key="inputs",
input_datatype=datatype_input,
label_key="label",
label_datatype=datatype_label,
)
for datatype_input, datatype_label in [(list, list), (list, self._turn_to_none)]:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=lambda x: default_data_collator(x, return_tensors="tf"),
base_data=features_base_multiple_labels,
input_key="inputs",
input_datatype=datatype_input,
label_key="label",
label_datatype=datatype_label,
)
features_base_single_label_alt = [{"input_ids": (0, 1, 2, 3, 4), "label": float(i)} for i in range(4)]
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=lambda x: default_data_collator(x, return_tensors="tf"),
base_data=features_base_single_label_alt,
input_key="input_ids",
input_datatype=list,
label_key="label",
label_datatype=float,
)
def test_with_padding_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_original = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}]
features_batch = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}]
data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=10, return_tensors="tf")
self._validate_original_data_against_collated_data(
collator=data_collator, original_data=features_original, batch_data=features_batch
)
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
self._validate_original_data_against_collated_data(
collator=data_collator, original_data=features_original, batch_data=features_batch
)
def test_for_token_classification_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_base = [
{"input_ids": (0, 1, 2), "labels": (0, 1, 2)},
{"input_ids": (0, 1, 2, 3, 4, 5), "labels": (0, 1, 2, 3, 4, 5)},
]
token_classification_collators = [
DataCollatorForTokenClassification(tokenizer, return_tensors="tf"),
DataCollatorForTokenClassification(tokenizer, padding="max_length", max_length=10, return_tensors="tf"),
DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8, return_tensors="tf"),
DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1, return_tensors="tf"),
]
for datatype_input, datatype_label in [(list, list)]:
for collator in token_classification_collators:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=collator,
base_data=features_base,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
)
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=token_classification_collators[-1],
base_data=features_base,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
ignore_label=True,
)
def test_seq2seq_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_base = [
{"input_ids": list(range(3)), "labels": list(range(3))},
{"input_ids": list(range(6)), "labels": list(range(6))},
]
seq2seq_collators = [
DataCollatorForSeq2Seq(tokenizer, padding=PaddingStrategy.LONGEST, return_tensors="tf"),
DataCollatorForSeq2Seq(tokenizer, padding=PaddingStrategy.MAX_LENGTH, max_length=7, return_tensors="tf"),
DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.LONGEST, pad_to_multiple_of=8, return_tensors="tf"
),
DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.LONGEST, label_pad_token_id=-1, return_tensors="tf"
),
]
for datatype_input, datatype_label in [(list, list)]:
for collator in seq2seq_collators:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=collator,
base_data=features_base,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
)
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=seq2seq_collators[-1],
base_data=features_base,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
ignore_label=True,
)
features_base_no_pad = [
{"input_ids": list(range(3)), "labels": list(range(3))},
{"input_ids": list(range(3)), "labels": list(range(3))},
]
seq2seq_no_padding_collator = DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.DO_NOT_PAD, return_tensors="tf"
)
for datatype_input, datatype_label in [(list, list)]:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=seq2seq_no_padding_collator,
base_data=features_base_no_pad,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
)
def test_language_modelling_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_base_no_pad = [
{"input_ids": tuple(range(10)), "labels": (1,)},
{"input_ids": tuple(range(10)), "labels": (1,)},
]
features_base_pad = [
{"input_ids": tuple(range(5)), "labels": (1,)},
{"input_ids": tuple(range(5)), "labels": (1,)},
]
lm_collators = [
DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf"),
DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="tf"),
DataCollatorForLanguageModeling(tokenizer, return_tensors="tf"),
DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf"),
]
for datatype_input, datatype_label in [(list, list)]:
for collator in lm_collators:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=collator,
base_data=features_base_no_pad,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
ignore_label=True,
)
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=collator,
base_data=features_base_pad,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
ignore_label=True,
)
def test_whole_world_masking_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_base = [
{"input_ids": list(range(10)), "labels": (1,)},
{"input_ids": list(range(10)), "labels": (1,)},
]
whole_word_masking_collator = DataCollatorForWholeWordMask(tokenizer, return_tensors="tf")
for datatype_input, datatype_label in [(list, list), (np.array, np.array)]:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=whole_word_masking_collator,
base_data=features_base,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
ignore_label=True,
)
def test_permutation_language_modelling_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
plm_collator = DataCollatorForPermutationLanguageModeling(tokenizer, return_tensors="tf")
no_pad_features_original = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
no_pad_features_batch = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
self._validate_original_data_against_collated_data(
collator=plm_collator, original_data=no_pad_features_original, batch_data=no_pad_features_batch
)
pad_features_original = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
pad_features_batch = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
self._validate_original_data_against_collated_data(
collator=plm_collator, original_data=pad_features_original, batch_data=pad_features_batch
)
    def test_next_sentence_prediction_collator_immutability(self):
        tokenizer = BertTokenizer(self.vocab_file)

        features_original = [
            {"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i}
            for i in range(2)
        ]
        features_batch = [
            {"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i}
            for i in range(2)
        ]

        nsp_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
        self._validate_original_data_against_collated_data(
            collator=nsp_collator, original_data=features_original, batch_data=features_batch
        )

        nsp_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
        self._validate_original_data_against_collated_data(
            collator=nsp_collator, original_data=features_original, batch_data=features_batch
        )
    def test_sentence_order_prediction_collator_immutability(self):
        tokenizer = BertTokenizer(self.vocab_file)

        features_original = [
            {
                "input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
                "token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
                "sentence_order_label": i,
            }
            for i in range(2)
        ]
        features_batch = [
            {
                "input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
                "token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
                "sentence_order_label": i,
            }
            for i in range(2)
        ]

        sop_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
        self._validate_original_data_against_collated_data(
            collator=sop_collator, original_data=features_original, batch_data=features_batch
        )

        sop_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
        self._validate_original_data_against_collated_data(
            collator=sop_collator, original_data=features_original, batch_data=features_batch
        )
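
The NSP and SOP cases can reuse `DataCollatorForLanguageModeling` because `tokenizer.pad` carries unrecognized label columns through to the batch unchanged. A brief illustration with the toy tokenizer from earlier:

```python
# Extra label columns such as next_sentence_label survive collation untouched.
from transformers import DataCollatorForLanguageModeling

nsp = DataCollatorForLanguageModeling(tokenizer, return_tensors="np")
batch = nsp([{"input_ids": [0, 1, 2, 3, 4], "next_sentence_label": i} for i in range(2)])
assert list(batch["next_sentence_label"]) == [0, 1]
```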

class NumpyDataCollatorIntegrationTest(unittest.TestCase):
    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()

View File

@ -1,60 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

from transformers import is_tf_available
from transformers.testing_utils import require_tf


if is_tf_available():
    import tensorflow as tf

    from transformers.activations_tf import get_tf_activation


@require_tf
class TestTFActivations(unittest.TestCase):
    def test_gelu_10(self):
        x = tf.constant([-100, -1.0, -0.1, 0, 0.1, 1.0, 100.0])
        gelu = get_tf_activation("gelu")
        gelu10 = get_tf_activation("gelu_10")

        y_gelu = gelu(x)
        y_gelu_10 = gelu10(x)

        clipped_mask = tf.where(y_gelu_10 < 10.0, 1.0, 0.0)

        self.assertEqual(tf.math.reduce_max(y_gelu_10).numpy().item(), 10.0)
        self.assertTrue(np.allclose(y_gelu * clipped_mask, y_gelu_10 * clipped_mask))

    def test_get_activation(self):
        get_tf_activation("gelu")
        get_tf_activation("gelu_10")
        get_tf_activation("gelu_fast")
        get_tf_activation("gelu_new")
        get_tf_activation("glu")
        get_tf_activation("mish")
        get_tf_activation("quick_gelu")
        get_tf_activation("relu")
        get_tf_activation("sigmoid")
        get_tf_activation("silu")
        get_tf_activation("swish")
        get_tf_activation("tanh")

        with self.assertRaises(KeyError):
            get_tf_activation("bogus")
        with self.assertRaises(KeyError):
            get_tf_activation(None)
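
The deleted checks have a close PyTorch analogue in `transformers.activations`, which this commit leaves untouched; a hedged sketch of the equivalent assertion:

```python
# Torch-side analogue of the deleted test: gelu_10 is gelu with outputs
# clipped to [-10, 10], so its maximum over large inputs is exactly 10.
import torch
from transformers.activations import get_activation

x = torch.tensor([-100.0, -1.0, -0.1, 0.0, 0.1, 1.0, 100.0])
gelu, gelu_10 = get_activation("gelu"), get_activation("gelu_10")
assert gelu_10(x).max().item() == 10.0
# Below the clipping threshold, the two activations agree.
mask = gelu_10(x) < 10.0
assert torch.allclose(gelu(x)[mask], gelu_10(x)[mask])
```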

View File

@ -36,7 +36,7 @@ from transformers.commands.add_new_model_like import (
retrieve_model_classes,
simplify_replacements,
)
from transformers.testing_utils import require_flax, require_tf, require_torch
from transformers.testing_utils import require_flax, require_torch
BERT_MODEL_FILES = {
@ -84,7 +84,6 @@ REPO_PATH = Path(transformers.__path__[0]).parent.parent
@require_torch
@require_tf
@require_flax
class TestAddNewModelLike(unittest.TestCase):
    def init_file(self, file_name, content):

View File

@ -19,7 +19,7 @@ from pathlib import Path
from typing import Union
import transformers
from transformers.testing_utils import require_tf, require_torch, slow
from transformers.testing_utils import require_torch, slow
logger = logging.getLogger()
@ -27,7 +27,6 @@ logger = logging.getLogger()
@unittest.skip(reason="Temporarily disable the doc tests.")
@require_torch
@require_tf
@slow
class TestCodeExamples(unittest.TestCase):
    def analyze_directory(

View File

@ -21,16 +21,13 @@ import transformers
# Try to import everything from transformers to ensure every object can be loaded.
from transformers import * # noqa F406
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_flax, require_tf, require_torch
from transformers.utils import ContextManagers, find_labels, is_flax_available, is_tf_available, is_torch_available
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_flax, require_torch
from transformers.utils import ContextManagers, find_labels, is_flax_available, is_torch_available
if is_torch_available():
    from transformers import BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification

if is_tf_available():
    from transformers import TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification

if is_flax_available():
    from transformers import FlaxBertForPreTraining, FlaxBertForQuestionAnswering, FlaxBertForSequenceClassification
@ -107,18 +104,6 @@ class GenericUtilTests(unittest.TestCase):
        self.assertEqual(find_labels(DummyModel), ["labels"])

    @require_tf
    def test_find_labels_tf(self):
        self.assertEqual(find_labels(TFBertForSequenceClassification), ["labels"])
        self.assertEqual(find_labels(TFBertForPreTraining), ["labels", "next_sentence_label"])
        self.assertEqual(find_labels(TFBertForQuestionAnswering), ["start_positions", "end_positions"])

        # find_labels works regardless of the class name (it detects the framework through inheritance)
        class DummyModel(TFBertForSequenceClassification):
            pass

        self.assertEqual(find_labels(DummyModel), ["labels"])

    @require_flax
    def test_find_labels_flax(self):
        # Flax models don't have labels
View File

@ -19,14 +19,13 @@ import numpy as np
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import BaseModelOutput
from transformers.testing_utils import require_flax, require_tf, require_torch
from transformers.testing_utils import require_flax, require_torch
from transformers.utils import (
    can_return_tuple,
    expand_dims,
    filter_out_non_signature_kwargs,
    flatten_dict,
    is_flax_available,
    is_tf_available,
    is_torch_available,
    reshape,
    squeeze,
@ -38,9 +37,6 @@ from transformers.utils import (
if is_flax_available():
    import jax.numpy as jnp

if is_tf_available():
    import tensorflow as tf

if is_torch_available():
    import torch
@ -88,16 +84,6 @@ class GenericTester(unittest.TestCase):
        t = torch.tensor(x)
        self.assertTrue(np.allclose(transpose(x, axes=(1, 2, 0)), transpose(t, axes=(1, 2, 0)).numpy()))

    @require_tf
    def test_transpose_tf(self):
        x = np.random.randn(3, 4)
        t = tf.constant(x)
        self.assertTrue(np.allclose(transpose(x), transpose(t).numpy()))

        x = np.random.randn(3, 4, 5)
        t = tf.constant(x)
        self.assertTrue(np.allclose(transpose(x, axes=(1, 2, 0)), transpose(t, axes=(1, 2, 0)).numpy()))

    @require_flax
    def test_transpose_flax(self):
        x = np.random.randn(3, 4)
@ -125,16 +111,6 @@ class GenericTester(unittest.TestCase):
        t = torch.tensor(x)
        self.assertTrue(np.allclose(reshape(x, (12, 5)), reshape(t, (12, 5)).numpy()))

    @require_tf
    def test_reshape_tf(self):
        x = np.random.randn(3, 4)
        t = tf.constant(x)
        self.assertTrue(np.allclose(reshape(x, (4, 3)), reshape(t, (4, 3)).numpy()))

        x = np.random.randn(3, 4, 5)
        t = tf.constant(x)
        self.assertTrue(np.allclose(reshape(x, (12, 5)), reshape(t, (12, 5)).numpy()))

    @require_flax
    def test_reshape_flax(self):
        x = np.random.randn(3, 4)
@ -162,16 +138,6 @@ class GenericTester(unittest.TestCase):
        t = torch.tensor(x)
        self.assertTrue(np.allclose(squeeze(x, axis=2), squeeze(t, axis=2).numpy()))

    @require_tf
    def test_squeeze_tf(self):
        x = np.random.randn(1, 3, 4)
        t = tf.constant(x)
        self.assertTrue(np.allclose(squeeze(x), squeeze(t).numpy()))

        x = np.random.randn(1, 4, 1, 5)
        t = tf.constant(x)
        self.assertTrue(np.allclose(squeeze(x, axis=2), squeeze(t, axis=2).numpy()))

    @require_flax
    def test_squeeze_flax(self):
        x = np.random.randn(1, 3, 4)
@ -192,12 +158,6 @@ class GenericTester(unittest.TestCase):
        t = torch.tensor(x)
        self.assertTrue(np.allclose(expand_dims(x, axis=1), expand_dims(t, axis=1).numpy()))

    @require_tf
    def test_expand_dims_tf(self):
        x = np.random.randn(3, 4)
        t = tf.constant(x)
        self.assertTrue(np.allclose(expand_dims(x, axis=1), expand_dims(t, axis=1).numpy()))

    @require_flax
    def test_expand_dims_flax(self):
        x = np.random.randn(3, 4)
@ -232,18 +192,6 @@ class GenericTester(unittest.TestCase):
        self.assertTrue(to_py_obj([t1, t2]) == [x1, x2])

    @require_tf
    def test_to_py_obj_tf(self):
        x1 = [[1, 2, 3], [4, 5, 6]]
        t1 = tf.constant(x1)
        self.assertTrue(to_py_obj(t1) == x1)

        x2 = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
        t2 = tf.constant(x2)
        self.assertTrue(to_py_obj(t2) == x2)

        self.assertTrue(to_py_obj([t1, t2]) == [x1, x2])

    @require_flax
    def test_to_py_obj_flax(self):
        x1 = [[1, 2, 3], [4, 5, 6]]

@ -256,25 +204,6 @@

        self.assertTrue(to_py_obj([t1, t2]) == [x1, x2])

    @require_torch
    @require_tf
    @require_flax
    def test_to_py_obj_mixed(self):
        x1 = [[1], [2]]
        t1 = np.array(x1)

        x2 = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
        t2 = torch.tensor(x2)

        x3 = [1, 2, 3]
        t3 = tf.constant(x3)

        x4 = [[[1.0, 2.0]]]
        t4 = jnp.array(x4)

        mixed = [(t1, t2), (t3, t4)]
        self.assertTrue(to_py_obj(mixed) == [[x1, x2], [x3, x4]])
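
Without the TF and Flax legs, the remaining guarantee is that `to_py_obj` recursively walks lists and tuples and converts any framework tensor to plain Python. A numpy/torch-only sketch:

```python
# to_py_obj recurses through containers and converts tensors to plain Python.
import numpy as np
import torch
from transformers.utils.generic import to_py_obj

mixed = [(np.array([[1], [2]]), torch.tensor([1.0, 2.0]))]
assert to_py_obj(mixed) == [[[[1], [2]], [1.0, 2.0]]]
```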

class ValidationDecoratorTester(unittest.TestCase):
    def test_cases_no_warning(self):

View File

@ -61,7 +61,6 @@ from transformers.testing_utils import (
    require_non_hpu,
    require_read_token,
    require_safetensors,
    require_tf,
    require_torch,
    require_torch_accelerator,
    require_torch_multi_accelerator,
@ -79,7 +78,6 @@ from transformers.utils.import_utils import (
    is_flash_attn_2_available,
    is_flash_attn_3_available,
    is_flax_available,
    is_tf_available,
    is_torch_npu_available,
    is_torch_sdpa_available,
)
@ -322,9 +320,6 @@ class TestModelGammaBeta(PreTrainedModel):
if is_flax_available():
    from transformers import FlaxBertModel

if is_tf_available():
    from transformers import TFBertModel


TINY_T5 = "patrickvonplaten/t5-tiny-random"
TINY_BERT_FOR_TOKEN_CLASSIFICATION = "hf-internal-testing/tiny-bert-for-token-classification"
@ -1535,27 +1530,6 @@ class ModelUtilsTest(TestCasePlus):
        for p1, p2 in zip(hub_model.parameters(), new_model.parameters()):
            self.assertTrue(torch.equal(p1, p2))

    @require_tf
    @require_safetensors
    def test_safetensors_torch_from_tf(self):
        hub_model = BertModel.from_pretrained("hf-internal-testing/tiny-bert-pt-only")
        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-bert-tf-only")

        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir, safe_serialization=True)
            new_model = BertModel.from_pretrained(tmp_dir)

        for p1, p2 in zip(hub_model.parameters(), new_model.parameters()):
            self.assertTrue(torch.equal(p1, p2))

    @require_tf
    def test_torch_from_tf(self):
        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-bert-tf-only")

        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir)
            _ = BertModel.from_pretrained(tmp_dir, from_tf=True)

    @require_safetensors
    def test_safetensors_torch_from_torch_sharded(self):
        model = BertModel.from_pretrained("hf-internal-testing/tiny-bert-pt-only")