Merge branch 'main' into add-owlv2-fast-processor

lmarshall12 authored 2025-06-25 19:16:59 +01:00, committed by GitHub
commit be55fff230
44 changed files with 21 additions and 2504 deletions


@@ -473,13 +473,6 @@ For example, here is a test that must be run only when there are 2 or more GPUs
def test_example_with_multi_gpu():
```
If a test requires `tensorflow`, use the `require_tf` decorator. For example:
```python no-style
@require_tf
def test_tf_thing_with_tensorflow():
```
These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is
how to set it up:
@@ -1204,9 +1197,6 @@ if torch.cuda.is_available():
import numpy as np
np.random.seed(seed)
# tf RNG
tf.random.set_seed(seed)
```
### Debugging tests


@@ -474,13 +474,6 @@ For example, here is a test that must be run only when there are 2 or more GPUs
def test_example_with_multi_gpu():
```
If a test requires `tensorflow`, use the `require_tf` decorator. For example:
```python no-style
@require_tf
def test_tf_thing_with_tensorflow():
```
These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is
how to set it up:
@@ -1226,11 +1219,6 @@ if torch.cuda.is_available():
import numpy as np
np.random.seed(seed)
# tf RNG
import tensorflow as tf
tf.random.set_seed(seed)
```
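For reference, a minimal sketch of the seeding helper this hunk trims the TensorFlow lines from (the lines around the hunk are not shown, so the exact shape of the helper is assumed):
```python
import random

import numpy as np
import torch

def set_seed(seed: int = 42):
    random.seed(seed)        # python RNG
    np.random.seed(seed)     # numpy RNG
    torch.manual_seed(seed)  # pytorch CPU RNG
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # pytorch RNGs on all GPUs
```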
### Debugging tests


@@ -445,13 +445,6 @@ CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py
def test_example_with_multi_gpu():
```
If a test requires `tensorflow`, use the `require_tf` decorator. For example:
```python no-style
@require_tf
def test_tf_thing_with_tensorflow():
```
These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is
how to set it up:
@@ -1135,9 +1128,6 @@ if torch.cuda.is_available():
import numpy as np
np.random.seed(seed)
# tf RNG
tf.random.set_seed(seed)
```


@@ -473,13 +473,6 @@ The GPU requirements are summarized in the table below:
def test_example_with_multi_gpu():
```
If `tensorflow` is required, use the `require_tf` decorator. For example:
```python no-style
@require_tf
def test_tf_thing_with_tensorflow():
```
These decorators can be stacked.
For example, if a test is slow and requires at least one GPU under pytorch, you can set it up as follows:


@@ -705,6 +705,9 @@ def require_tf(test_case):
"""
Decorator marking a test that requires TensorFlow. These tests are skipped when TensorFlow isn't installed.
"""
logger.warning_once(
"TensorFlow test-related code, including `require_tf`, is deprecated and will be removed in Transformers v4.55"
)
return unittest.skipUnless(is_tf_available(), "test requires TensorFlow")(test_case)
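In practice, the added lines above mean a test still decorated with `require_tf` keeps its skip-unless-TensorFlow behavior but now triggers a one-time deprecation warning when the decorator runs. A hypothetical caller (the test body below is illustrative, not part of this diff):
```python
from transformers.testing_utils import require_tf

@require_tf  # logs the deprecation warning once at decoration time, then applies the usual skipUnless
def test_tf_thing_with_tensorflow():
    ...  # still skipped when TensorFlow is not installed
```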


@@ -1,106 +0,0 @@
import unittest
from pathlib import Path
from tempfile import TemporaryDirectory
from transformers import AutoConfig, TFAutoModel, is_tensorflow_text_available, is_tf_available
from transformers.models.bert.tokenization_bert import BertTokenizer
from transformers.testing_utils import require_tensorflow_text, require_tf, slow
if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_utils import keras
if is_tensorflow_text_available():
from transformers.models.bert import TFBertTokenizer
TOKENIZER_CHECKPOINTS = ["google-bert/bert-base-uncased", "google-bert/bert-base-cased"]
TINY_MODEL_CHECKPOINT = "hf-internal-testing/tiny-bert-tf-only"
if is_tf_available():
from transformers.modeling_tf_utils import keras
class ModelToSave(keras.Model):
def __init__(self, tokenizer):
super().__init__()
self.tokenizer = tokenizer
config = AutoConfig.from_pretrained(TINY_MODEL_CHECKPOINT)
self.bert = TFAutoModel.from_config(config)
def call(self, inputs):
tokenized = self.tokenizer(inputs)
out = self.bert(tokenized)
return out["pooler_output"]
@require_tf
@require_tensorflow_text
class BertTokenizationTest(unittest.TestCase):
# The TF tokenizers are usually going to be used as pretrained tokenizers from existing model checkpoints,
# so that's what we focus on here.
def setUp(self):
super().setUp()
self.tokenizers = [BertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
self.tf_tokenizers = [TFBertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
assert len(self.tokenizers) == len(self.tf_tokenizers)
self.test_sentences = [
"This is a straightforward English test sentence.",
"This one has some weird characters\rto\nsee\r\nif those\u00e9break things.",
"Now we're going to add some Chinese: 一 二 三 一二三",
"And some much more rare Chinese: 齉 堃 齉堃",
"Je vais aussi écrire en français pour tester les accents",
"Classical Irish also has some unusual characters, so in they go: Gaelaċ, ꝼ",
]
self.paired_sentences = list(zip(self.test_sentences, self.test_sentences[::-1]))
def test_output_equivalence(self):
for tokenizer, tf_tokenizer in zip(self.tokenizers, self.tf_tokenizers):
for test_inputs in (self.test_sentences, self.paired_sentences):
python_outputs = tokenizer(test_inputs, return_tensors="tf", padding="longest")
tf_outputs = tf_tokenizer(test_inputs)
for key in python_outputs.keys():
self.assertTrue(tf.reduce_all(python_outputs[key].shape == tf_outputs[key].shape))
self.assertTrue(tf.reduce_all(tf.cast(python_outputs[key], tf.int64) == tf_outputs[key]))
@slow
def test_different_pairing_styles(self):
for tf_tokenizer in self.tf_tokenizers:
merged_outputs = tf_tokenizer(self.paired_sentences)
separated_outputs = tf_tokenizer(
text=[sentence[0] for sentence in self.paired_sentences],
text_pair=[sentence[1] for sentence in self.paired_sentences],
)
for key in merged_outputs.keys():
self.assertTrue(tf.reduce_all(tf.cast(merged_outputs[key], tf.int64) == separated_outputs[key]))
@slow
def test_graph_mode(self):
for tf_tokenizer in self.tf_tokenizers:
compiled_tokenizer = tf.function(tf_tokenizer)
for test_inputs in (self.test_sentences, self.paired_sentences):
test_inputs = tf.constant(test_inputs)
compiled_outputs = compiled_tokenizer(test_inputs)
eager_outputs = tf_tokenizer(test_inputs)
for key in eager_outputs.keys():
self.assertTrue(tf.reduce_all(eager_outputs[key] == compiled_outputs[key]))
@slow
def test_export_for_inference(self):
for tf_tokenizer in self.tf_tokenizers:
model = ModelToSave(tokenizer=tf_tokenizer)
test_inputs = tf.convert_to_tensor(self.test_sentences)
out = model(test_inputs) # Build model with some sample inputs
with TemporaryDirectory() as tempdir:
save_path = Path(tempdir) / "saved.model"
model.export(save_path)
loaded_model = tf.saved_model.load(save_path)
loaded_output = loaded_model.serve(test_inputs)
# We may see small differences because the loaded model is compiled, so we need an epsilon for the test
self.assertLessEqual(tf.reduce_max(tf.abs(out - loaded_output)), 1e-5)


@@ -1,131 +0,0 @@
import unittest
from pathlib import Path
from tempfile import TemporaryDirectory
from transformers import AutoConfig, TFGPT2LMHeadModel, is_keras_nlp_available, is_tf_available
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from transformers.testing_utils import require_keras_nlp, require_tf, slow
if is_tf_available():
import tensorflow as tf
if is_keras_nlp_available():
from transformers.models.gpt2 import TFGPT2Tokenizer
TOKENIZER_CHECKPOINTS = ["openai-community/gpt2"]
TINY_MODEL_CHECKPOINT = "openai-community/gpt2"
if is_tf_available():
class ModelToSave(tf.Module):
def __init__(self, tokenizer):
super().__init__()
self.tokenizer = tokenizer
config = AutoConfig.from_pretrained(TINY_MODEL_CHECKPOINT)
self.model = TFGPT2LMHeadModel.from_config(config)
@tf.function(input_signature=(tf.TensorSpec((None,), tf.string, name="text"),))
def serving(self, text):
tokenized = self.tokenizer(text)
input_ids_dense = tokenized["input_ids"].to_tensor()
input_mask = tf.cast(input_ids_dense > 0, tf.int32)
# input_mask = tf.reshape(input_mask, [-1, MAX_SEQ_LEN])
outputs = self.model(input_ids=input_ids_dense, attention_mask=input_mask)["logits"]
return outputs
@require_tf
@require_keras_nlp
class GPTTokenizationTest(unittest.TestCase):
# The TF tokenizers are usually going to be used as pretrained tokenizers from existing model checkpoints,
# so that's what we focus on here.
def setUp(self):
super().setUp()
self.tokenizers = [GPT2Tokenizer.from_pretrained(checkpoint) for checkpoint in (TOKENIZER_CHECKPOINTS)]
self.tf_tokenizers = [TFGPT2Tokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
assert len(self.tokenizers) == len(self.tf_tokenizers)
self.test_sentences = [
"This is a straightforward English test sentence.",
"This one has some weird characters\rto\nsee\r\nif those\u00e9break things.",
"Now we're going to add some Chinese: 一 二 三 一二三",
"And some much more rare Chinese: 齉 堃 齉堃",
"Je vais aussi écrire en français pour tester les accents",
"Classical Irish also has some unusual characters, so in they go: Gaelaċ, ꝼ",
]
self.paired_sentences = list(zip(self.test_sentences, self.test_sentences[::-1]))
def test_output_equivalence(self):
for tokenizer, tf_tokenizer in zip(self.tokenizers, self.tf_tokenizers):
for test_inputs in self.test_sentences:
python_outputs = tokenizer([test_inputs], return_tensors="tf")
tf_outputs = tf_tokenizer([test_inputs])
for key in python_outputs.keys():
# convert them to numpy to avoid messing with ragged tensors
python_outputs_values = python_outputs[key].numpy()
tf_outputs_values = tf_outputs[key].numpy()
self.assertTrue(tf.reduce_all(python_outputs_values.shape == tf_outputs_values.shape))
self.assertTrue(tf.reduce_all(tf.cast(python_outputs_values, tf.int64) == tf_outputs_values))
@slow
def test_graph_mode(self):
for tf_tokenizer in self.tf_tokenizers:
compiled_tokenizer = tf.function(tf_tokenizer)
for test_inputs in self.test_sentences:
test_inputs = tf.constant(test_inputs)
compiled_outputs = compiled_tokenizer(test_inputs)
eager_outputs = tf_tokenizer(test_inputs)
for key in eager_outputs.keys():
self.assertTrue(tf.reduce_all(eager_outputs[key] == compiled_outputs[key]))
@slow
def test_saved_model(self):
for tf_tokenizer in self.tf_tokenizers:
model = ModelToSave(tokenizer=tf_tokenizer)
test_inputs = tf.convert_to_tensor([self.test_sentences[0]])
out = model.serving(test_inputs) # Build model with some sample inputs
with TemporaryDirectory() as tempdir:
save_path = Path(tempdir) / "saved.model"
tf.saved_model.save(model, save_path, signatures={"serving_default": model.serving})
loaded_model = tf.saved_model.load(save_path)
loaded_output = loaded_model.signatures["serving_default"](test_inputs)["output_0"]
# We may see small differences because the loaded model is compiled, so we need an epsilon for the test
self.assertTrue(tf.reduce_all(out == loaded_output))
@slow
def test_from_config(self):
for tf_tokenizer in self.tf_tokenizers:
test_inputs = tf.convert_to_tensor([self.test_sentences[0]])
out = tf_tokenizer(test_inputs) # Build model with some sample inputs
config = tf_tokenizer.get_config()
model_from_config = TFGPT2Tokenizer.from_config(config)
from_config_output = model_from_config(test_inputs)
for key in from_config_output.keys():
self.assertTrue(tf.reduce_all(from_config_output[key] == out[key]))
@slow
def test_padding(self):
for tf_tokenizer in self.tf_tokenizers:
# for the test to run
tf_tokenizer.pad_token_id = 123123
for max_length in [3, 5, 1024]:
test_inputs = tf.convert_to_tensor([self.test_sentences[0]])
out = tf_tokenizer(test_inputs, max_length=max_length)
out_length = out["input_ids"].numpy().shape[1]
assert out_length == max_length


@@ -34,7 +34,6 @@ from transformers import (
from transformers.models.layoutlmv3.tokenization_layoutlmv3 import VOCAB_FILES_NAMES, LayoutLMv3Tokenizer
from transformers.testing_utils import (
require_pandas,
require_tf,
require_tokenizers,
require_torch,
slow,
@@ -2306,42 +2305,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_np_encode_plus_sent_to_model(self):
pass
@require_tf
@slow
def test_tf_encode_plus_sent_to_model(self):
from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING
MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING)
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING")
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.")
model = model_class(config)
# Make sure the model contains at least the full vocabulary size in its embedding matrix
self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))
# Build sequence
first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(first_ten_tokens))]
encoded_sequence = tokenizer.encode_plus(first_ten_tokens, boxes=boxes, return_tensors="tf")
batch_encoded_sequence = tokenizer.batch_encode_plus(
[first_ten_tokens, first_ten_tokens], boxes=[boxes, boxes], return_tensors="tf"
)
# This should not fail
model(encoded_sequence)
model(batch_encoded_sequence)
@unittest.skip(reason="Chat is not supported")
def test_chat_template(self):
pass


@@ -24,7 +24,6 @@ from transformers.testing_utils import (
require_essentia,
require_librosa,
require_scipy,
require_tf,
require_torch,
)
from transformers.utils.import_utils import (
@@ -231,28 +230,6 @@ class Pop2PianoFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittes
# check shape
self.assertEqual(len(input_features["input_features"].shape), 3)
@require_tf
def test_batch_feature_tf(self):
import tensorflow as tf
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
speech_input1 = np.zeros([1_000_000], dtype=np.float32)
speech_input2 = np.ones([2_000_000], dtype=np.float32)
speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
input_features = feature_extractor(
[speech_input1, speech_input2, speech_input3],
sampling_rate=[44_100, 16_000, 48_000],
return_tensors="tf",
return_attention_mask=True,
)
# check tf tensor or not
self.assertTrue(tf.is_tensor(input_features["input_features"]))
# check shape
self.assertEqual(len(input_features["input_features"].shape), 3)
@unittest.skip(
"Pop2PianoFeatureExtractor does not supports padding externally (while processing audios in batches padding is automatically applied to max_length)"
)


@@ -17,15 +17,10 @@ import unittest
import numpy as np
-from transformers.testing_utils import (
-require_tf,
-require_torch,
-require_torchvision,
-require_vision,
-)
-from transformers.utils import is_tf_available, is_torch_available, is_vision_available
+from transformers.testing_utils import require_torch, require_torchvision, require_vision
+from transformers.utils import is_torch_available, is_vision_available
-from ...test_processing_common import ProcessorTesterMixin, prepare_image_inputs
+from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
@@ -38,11 +33,6 @@ if is_torch_available():
from transformers.models.sam.image_processing_sam import _mask_to_rle_pytorch
if is_tf_available():
import tensorflow as tf
from transformers.models.sam.image_processing_sam import _mask_to_rle_tf
@require_vision
@require_torchvision
@@ -202,143 +192,3 @@ class SamProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertEqual(len(rle), 1)
self.assertEqual(rle[0]["size"], [2, 2])
self.assertEqual(rle[0]["counts"], [1, 3])  # 1 zero, followed by 3 ones
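Only the TensorFlow copy of this RLE test is deleted below; the PyTorch helper exercised in the kept lines above still behaves as the comments in the removed code describe. A minimal usage sketch (assuming `_mask_to_rle_pytorch` takes a batched binary mask, as in the test):
```python
import torch
from transformers.models.sam.image_processing_sam import _mask_to_rle_pytorch

# One 2x2 mask: row 0 = [0, 1], row 1 = [1, 1].
# Flattened in Fortran (column-major) order -> [0, 1, 1, 1], i.e. 1 zero then 3 ones.
mask = torch.tensor([[[0, 1], [1, 1]]], dtype=torch.int64)
rle = _mask_to_rle_pytorch(mask)
print(rle[0]["size"], rle[0]["counts"])  # [2, 2] [1, 3]
```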
@require_vision
@require_tf
class TFSamProcessorTest(unittest.TestCase):
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
image_processor = SamImageProcessor()
processor = SamProcessor(image_processor)
processor.save_pretrained(self.tmpdirname)
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def tearDown(self):
shutil.rmtree(self.tmpdirname)
# This is to avoid repeating the skipping of the common tests
def prepare_image_inputs(self):
"""This function prepares a list of PIL images."""
return prepare_image_inputs()
def test_save_load_pretrained_additional_features(self):
processor = SamProcessor(image_processor=self.get_image_processor())
processor.save_pretrained(self.tmpdirname)
image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
processor = SamProcessor.from_pretrained(self.tmpdirname, do_normalize=False, padding_value=1.0)
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.image_processor, SamImageProcessor)
def test_image_processor(self):
image_processor = self.get_image_processor()
processor = SamProcessor(image_processor=image_processor)
image_input = self.prepare_image_inputs()
input_feat_extract = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np")
input_feat_extract.pop("original_sizes") # pop original_sizes as it is popped in the processor
input_feat_extract.pop("reshaped_input_sizes") # pop reshaped_input_sizes as it is popped in the processor
for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
@require_tf
def test_post_process_masks(self):
image_processor = self.get_image_processor()
processor = SamProcessor(image_processor=image_processor)
dummy_masks = [tf.ones((1, 3, 5, 5))]
original_sizes = [[1764, 2646]]
reshaped_input_size = [[683, 1024]]
masks = processor.post_process_masks(dummy_masks, original_sizes, reshaped_input_size, return_tensors="tf")
self.assertEqual(masks[0].shape, (1, 3, 1764, 2646))
masks = processor.post_process_masks(
dummy_masks,
tf.convert_to_tensor(original_sizes),
tf.convert_to_tensor(reshaped_input_size),
return_tensors="tf",
)
self.assertEqual(masks[0].shape, (1, 3, 1764, 2646))
# should also work with np
dummy_masks = [np.ones((1, 3, 5, 5))]
masks = processor.post_process_masks(
dummy_masks, np.array(original_sizes), np.array(reshaped_input_size), return_tensors="tf"
)
self.assertEqual(masks[0].shape, (1, 3, 1764, 2646))
dummy_masks = [[1, 0], [0, 1]]
with self.assertRaises(tf.errors.InvalidArgumentError):
masks = processor.post_process_masks(
dummy_masks, np.array(original_sizes), np.array(reshaped_input_size), return_tensors="tf"
)
def test_rle_encoding(self):
"""
Test the run-length encoding function.
"""
# Test that a mask of all zeros returns a single run [height * width].
input_mask = tf.zeros((1, 2, 2), dtype=tf.int64) # shape: 1 x 2 x 2
rle = _mask_to_rle_tf(input_mask)
self.assertEqual(len(rle), 1)
self.assertEqual(rle[0]["size"], [2, 2])
# For a 2x2 all-zero mask, we expect a single run of length 4:
self.assertEqual(rle[0]["counts"], [4])
# Test that a mask of all ones returns [0, height * width].
input_mask = tf.ones((1, 2, 2), dtype=tf.int64) # shape: 1 x 2 x 2
rle = _mask_to_rle_tf(input_mask)
self.assertEqual(len(rle), 1)
self.assertEqual(rle[0]["size"], [2, 2])
# For a 2x2 all-one mask, we expect two runs: [0, 4].
self.assertEqual(rle[0]["counts"], [0, 4])
# Test a mask with mixed 0s and 1s to ensure the run-length encoding is correct.
# Example mask:
# Row 0: [0, 1]
# Row 1: [1, 1]
# This is shape (1, 2, 2).
# Flattened in Fortran order -> [0, 1, 1, 1].
# The RLE for [0,1,1,1] is [1, 3].
input_mask = tf.constant([[[0, 1], [1, 1]]], dtype=tf.int64)
rle = _mask_to_rle_tf(input_mask)
self.assertEqual(len(rle), 1)
self.assertEqual(rle[0]["size"], [2, 2])
self.assertEqual(rle[0]["counts"], [1, 3]) # 1 zero, followed by 3 ones
@require_vision
@require_torchvision
class SamProcessorEquivalenceTest(unittest.TestCase):
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
image_processor = SamImageProcessor()
processor = SamProcessor(image_processor)
processor.save_pretrained(self.tmpdirname)
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def tearDown(self):
shutil.rmtree(self.tmpdirname)
# This is to avoid repeating the skipping of the common tests
def prepare_image_inputs(self):
"""This function prepares a list of PIL images."""
return prepare_image_inputs()


@@ -18,7 +18,7 @@ import numpy as np
from transformers.models.whisper import WhisperTokenizer, WhisperTokenizerFast
from transformers.models.whisper.tokenization_whisper import _combine_tokens_into_words, _find_longest_common_sequence
-from transformers.testing_utils import require_flax, require_tf, require_torch, slow
+from transformers.testing_utils import require_flax, require_torch, slow
from ...test_tokenization_common import TokenizerTesterMixin
@@ -588,15 +588,6 @@ class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
self.assertListEqual(WhisperTokenizer._convert_to_list(np_array), test_list)
self.assertListEqual(WhisperTokenizerFast._convert_to_list(np_array), test_list)
@require_tf
def test_convert_to_list_tf(self):
import tensorflow as tf
test_list = [[1, 2, 3], [4, 5, 6]]
tf_tensor = tf.constant(test_list)
self.assertListEqual(WhisperTokenizer._convert_to_list(tf_tensor), test_list)
self.assertListEqual(WhisperTokenizerFast._convert_to_list(tf_tensor), test_list)
@require_flax
def test_convert_to_list_jax(self):
import jax.numpy as jnp


@@ -1,100 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import is_tf_available
from transformers.testing_utils import require_tf
if is_tf_available():
import tensorflow as tf
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from transformers import GradientAccumulator, create_optimizer
@require_tf
class OptimizationFTest(unittest.TestCase):
def assertListAlmostEqual(self, list1, list2, tol):
self.assertEqual(len(list1), len(list2))
for a, b in zip(list1, list2):
self.assertAlmostEqual(a, b, delta=tol)
def testGradientAccumulator(self):
accumulator = GradientAccumulator()
accumulator([tf.constant([1.0, 2.0])])
accumulator([tf.constant([-2.0, 1.0])])
accumulator([tf.constant([-1.0, 2.0])])
with self.assertRaises(ValueError):
accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
self.assertEqual(accumulator.step, 3)
self.assertEqual(len(accumulator.gradients), 1)
self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [-2.0, 5.0], tol=1e-2)
accumulator.reset()
self.assertEqual(accumulator.step, 0)
self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [0.0, 0.0], tol=1e-2)
def testGradientAccumulatorDistributionStrategy(self):
context._context = None
ops.enable_eager_execution_internal()
physical_devices = tf.config.list_physical_devices("CPU")
if len(physical_devices) == 1:
tf.config.set_logical_device_configuration(
physical_devices[0], [tf.config.LogicalDeviceConfiguration(), tf.config.LogicalDeviceConfiguration()]
)
devices = tf.config.list_logical_devices(device_type="CPU")
strategy = tf.distribute.MirroredStrategy(devices=devices[:2])
with strategy.scope():
accumulator = GradientAccumulator()
variable = tf.Variable([4.0, 3.0])
optimizer, _ = create_optimizer(5e-5, 10, 5)
gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)
def accumulate_on_replica(gradient):
accumulator([gradient])
def apply_on_replica():
optimizer.apply_gradients(list(zip(accumulator.gradients, [variable])))
@tf.function
def accumulate(grad1, grad2):
with strategy.scope():
local_variables = strategy.experimental_local_results(gradient_placeholder)
local_variables[0].assign(grad1)
local_variables[1].assign(grad2)
strategy.run(accumulate_on_replica, args=(gradient_placeholder,))
@tf.function
def apply_grad():
with strategy.scope():
strategy.run(apply_on_replica)
def _check_local_values(grad1, grad2):
values = strategy.experimental_local_results(accumulator._gradients[0])
self.assertListAlmostEqual(values[0].value(), grad1, tol=1e-2)
self.assertListAlmostEqual(values[1].value(), grad2, tol=1e-2)
accumulate([1.0, 2.0], [-1.0, 1.0])
accumulate([3.0, -1.0], [-1.0, -1.0])
accumulate([-2.0, 2.0], [3.0, -2.0])
self.assertEqual(accumulator.step, 3)
_check_local_values([2.0, 3.0], [1.0, -2.0])
apply_grad()
self.assertListAlmostEqual(variable.value(), [4.0, 3.0], tol=1e-2)
accumulator.reset()
self.assertEqual(accumulator.step, 0)
_check_local_values([0.0, 0.0], [0.0, 0.0])


@@ -28,7 +28,6 @@ from transformers.testing_utils import (
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
require_tf,
require_torch,
require_torchaudio,
slow,
@@ -193,11 +192,6 @@ class AudioClassificationPipelineTests(unittest.TestCase):
],
)
@require_tf
@unittest.skip(reason="Audio classification is not implemented for TF")
def test_small_model_tf(self):
pass
@require_torch
@slow
def test_top_k_none_returns_all_labels(self):


@@ -40,7 +40,6 @@ from transformers.testing_utils import (
is_torch_available,
nested_simplify,
require_pyctcdecode,
require_tf,
require_torch,
require_torch_accelerator,
require_torchaudio,
@@ -326,10 +325,6 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
):
_ = speech_recognizer(filename, return_timestamps="char")
@require_tf
def test_small_model_tf(self):
self.skipTest(reason="Tensorflow not supported yet.")
@require_torch
@unittest.skip("TODO (joao, eustache): this test is failing, find the breaking PR and fix the cause or the test")
def test_torch_small_no_tokenizer_files(self):


@@ -48,8 +48,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_staging_test,
nested_simplify,
require_tensorflow_probability,
require_tf,
require_torch,
require_torch_accelerator,
require_torch_multi_accelerator,
@@ -177,20 +175,6 @@ class CommonPipelineTest(unittest.TestCase):
results.append(out)
self.assertEqual(len(results), 10)
@require_tf
def test_iterator_data_tf(self):
def data(n: int):
for _ in range(n):
yield "This is a test"
pipe = pipeline(model="hf-internal-testing/tiny-random-distilbert", framework="tf")
out = pipe("This is a test")
results = []
for out in pipe(data(10)):
self.assertEqual(nested_simplify(out), {"label": "LABEL_0", "score": 0.504})
results.append(out)
self.assertEqual(len(results), 10)
@require_torch
def test_unbatch_attentions_hidden_states(self):
model = DistilBertForSequenceClassification.from_pretrained(
@@ -262,9 +246,9 @@
@is_pipeline_test
+@require_torch
class PipelineScikitCompatTest(unittest.TestCase):
-@require_torch
-def test_pipeline_predict_pt(self):
+def test_pipeline_predict(self):
data = ["This is a test"]
text_classifier = pipeline(
@@ -275,20 +259,7 @@
actual_output = text_classifier.predict(data)
self.assertEqual(expected_output, actual_output)
-@require_tf
-def test_pipeline_predict_tf(self):
-data = ["This is a test"]
-text_classifier = pipeline(
-task="text-classification", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
-)
-expected_output = [{"label": ANY(str), "score": ANY(float)}]
-actual_output = text_classifier.predict(data)
-self.assertEqual(expected_output, actual_output)
-@require_torch
-def test_pipeline_transform_pt(self):
+def test_pipeline_transform(self):
data = ["This is a test"]
text_classifier = pipeline(
@@ -299,18 +270,6 @@
actual_output = text_classifier.transform(data)
self.assertEqual(expected_output, actual_output)
@require_tf
def test_pipeline_transform_tf(self):
data = ["This is a test"]
text_classifier = pipeline(
task="text-classification", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
)
expected_output = [{"label": ANY(str), "score": ANY(float)}]
actual_output = text_classifier.transform(data)
self.assertEqual(expected_output, actual_output)
@is_pipeline_test
class PipelinePadTest(unittest.TestCase):
@@ -620,23 +579,6 @@ class PipelineUtilsTest(unittest.TestCase):
gc.collect()
backend_empty_cache(torch_device)
@slow
@require_tf
def test_load_default_pipelines_tf(self):
from transformers.modeling_tf_utils import keras
from transformers.pipelines import SUPPORTED_TASKS
set_seed_fn = lambda: keras.utils.set_random_seed(0) # noqa: E731
for task in SUPPORTED_TASKS.keys():
if task == "table-question-answering":
# test table in separate test due to more dependencies
continue
self.check_default_pipeline(task, "tf", set_seed_fn, self.check_models_equal_tf)
# clean-up as much as possible GPU memory occupied by TF
gc.collect()
@slow
@require_torch
def test_load_default_pipelines_pt_table_qa(self):
@@ -663,18 +605,6 @@
pipe = pipeline("text-generation", device=torch_device)
_ = pipe("Hello")
@slow
@require_tf
@require_tensorflow_probability
def test_load_default_pipelines_tf_table_qa(self):
import tensorflow as tf
set_seed_fn = lambda: tf.random.set_seed(0) # noqa: E731
self.check_default_pipeline("table-question-answering", "tf", set_seed_fn, self.check_models_equal_tf)
# clean-up as much as possible GPU memory occupied by PyTorch
gc.collect()
def check_default_pipeline(self, task, framework, set_seed_fn, check_models_equal_fn):
from transformers.pipelines import SUPPORTED_TASKS, pipeline


@@ -24,7 +24,6 @@ from transformers.testing_utils import (
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
require_tf,
require_timm,
require_torch,
require_vision,
@@ -123,11 +122,6 @@ class DepthEstimationPipelineTests(unittest.TestCase):
for single_output in outputs:
compare_pipeline_output_to_hub_spec(single_output, DepthEstimationOutput)
@require_tf
@unittest.skip(reason="Depth estimation is not implemented in TF")
def test_small_model_tf(self):
pass
@slow
@require_torch
def test_large_model_pt(self):


@@ -27,7 +27,6 @@ from transformers.testing_utils import (
nested_simplify,
require_detectron2,
require_pytesseract,
require_tf,
require_torch,
require_torch_bf16,
require_vision,
@@ -423,8 +422,3 @@
question = "What is the invoice number?"
outputs = dqa_pipeline(image=image, question=question, top_k=2)
self.assertEqual(nested_simplify(outputs, decimals=4), [{"answer": "us-001"}])
@require_tf
@unittest.skip(reason="Document question answering not implemented in TF")
def test_small_model_tf(self):
pass


@@ -23,19 +23,15 @@ from transformers import (
TF_MODEL_MAPPING,
FeatureExtractionPipeline,
LxmertConfig,
is_tf_available,
is_torch_available,
pipeline,
)
-from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch
+from transformers.testing_utils import is_pipeline_test, nested_simplify, require_torch
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
@is_pipeline_test
class FeatureExtractionPipelineTests(unittest.TestCase):
@@ -52,16 +48,6 @@ class FeatureExtractionPipelineTests(unittest.TestCase):
nested_simplify(outputs),
[[[2.287, 1.234, 0.042, 1.53, 1.306, 0.879, -0.526, -1.71, -1.276, 0.756, -0.775, -1.048, -0.25, -0.595, -0.137, -0.598, 2.022, -0.812, 0.284, -0.488, -0.391, -0.403, -0.525, -0.061, -0.228, 1.086, 0.378, -0.14, 0.599, -0.087, -2.259, -0.098], [1.676, 0.232, -1.508, -0.145, 1.798, -1.388, 1.331, -0.37, -0.939, 0.043, 0.06, -0.414, -1.408, 0.24, 0.622, -0.55, -0.569, 1.873, -0.706, 1.924, -0.254, 1.927, -0.423, 0.152, -0.952, 0.509, -0.496, -0.968, 0.093, -1.049, -0.65, 0.312], [0.207, -0.775, -1.822, 0.321, -0.71, -0.201, 0.3, 1.146, -0.233, -0.753, -0.305, 1.309, -1.47, -0.21, 1.802, -1.555, -1.175, 1.323, -0.303, 0.722, -0.076, 0.103, -1.406, 1.931, 0.091, 0.237, 1.172, 1.607, 0.253, -0.9, -1.068, 0.438], [0.615, 1.077, 0.171, -0.175, 1.3, 0.901, -0.653, -0.138, 0.341, -0.654, -0.184, -0.441, -0.424, 0.356, -0.075, 0.26, -1.023, 0.814, 0.524, -0.904, -0.204, -0.623, 1.234, -1.03, 2.594, 0.56, 1.831, -0.199, -1.508, -0.492, -1.687, -2.165], [0.129, 0.008, -1.279, -0.412, -0.004, 1.663, 0.196, 0.104, 0.123, 0.119, 0.635, 1.757, 2.334, -0.799, -1.626, -1.26, 0.595, -0.316, -1.399, 0.232, 0.264, 1.386, -1.171, -0.256, -0.256, -1.944, 1.168, -0.368, -0.714, -0.51, 0.454, 1.148], [-0.32, 0.29, -1.309, -0.177, 0.453, 0.636, -0.024, 0.509, 0.931, -1.754, -1.575, 0.786, 0.046, -1.165, -1.416, 1.373, 1.293, -0.285, -1.541, -1.186, -0.106, -0.994, 2.001, 0.972, -0.02, 1.654, -0.236, 0.643, 1.02, 0.572, -0.914, -0.154], [0.7, -0.937, 0.441, 0.25, 0.78, -0.022, 0.282, -0.095, 1.558, -0.336, 1.706, 0.884, 1.28, 0.198, -0.796, 1.218, -1.769, 1.197, -0.342, -0.177, -0.645, 1.364, 0.008, -0.597, -0.484, -2.772, -0.696, -0.632, -0.34, -1.527, -0.562, 0.862], [2.504, 0.831, -1.271, -0.033, 0.298, -0.735, 1.339, 1.74, 0.233, -1.424, -0.819, -0.761, 0.291, 0.853, -0.092, -0.885, 0.164, 1.025, 0.907, 0.749, -1.515, -0.545, -1.365, 0.271, 0.034, -2.005, 0.031, 0.244, 0.621, 0.176, 0.336, -1.196], [-0.711, 0.591, -1.001, -0.946, 0.784, -1.66, 1.545, 0.799, -0.857, 1.148, 0.213, -0.285, 0.464, -0.139, 0.79, -1.663, -1.121, 0.575, -0.178, -0.508, 1.565, -0.242, -0.346, 1.024, -1.135, -0.158, -2.101, 0.275, 2.009, -0.425, 0.716, 0.981], [0.912, -1.186, -0.846, -0.421, -1.315, -0.827, 0.309, 0.533, 1.029, -2.343, 1.513, -1.238, 1.487, -0.849, 0.896, -0.927, -0.459, 0.159, 0.177, 0.873, 0.935, 1.433, -0.485, 0.737, 1.327, -0.338, 1.608, -0.47, -0.445, -1.118, -0.213, -0.446], [-0.434, -1.362, -1.098, -1.068, 1.507, 0.003, 0.413, -0.395, 0.897, -0.237, 1.405, -0.344, 1.693, 0.677, 0.097, -0.257, -0.602, 1.026, -1.229, 0.855, -0.713, 1.014, 0.443, 0.238, 0.425, -2.184, 1.933, -1.157, -1.132, -0.597, -0.785, 0.967], [0.58, -0.971, 0.789, -0.468, -0.576, 1.779, 1.747, 1.715, -1.939, 0.125, 0.656, -0.042, -1.024, -1.767, 0.107, -0.408, -0.866, -1.774, 1.248, 0.939, -0.033, 1.523, 1.168, -0.744, 0.209, -0.168, -0.316, 0.207, -0.432, 0.047, -0.646, -0.664], [-0.185, -0.613, -1.695, 1.602, -0.32, -0.277, 0.967, 0.728, -0.965, -0.234, 1.069, -0.63, -1.631, 0.711, 0.426, 1.298, -0.191, -0.467, -0.771, 0.971, -0.118, -1.577, -2.064, -0.055, -0.59, 0.642, -0.997, 1.251, 0.538, 1.367, 0.106, 1.704]]]) # fmt: skip
@require_tf
def test_small_model_tf(self):
feature_extractor = pipeline(
task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
)
outputs = feature_extractor("This is a test")
self.assertEqual(
nested_simplify(outputs),
[[[2.287, 1.234, 0.042, 1.53, 1.306, 0.879, -0.526, -1.71, -1.276, 0.756, -0.775, -1.048, -0.25, -0.595, -0.137, -0.598, 2.022, -0.812, 0.284, -0.488, -0.391, -0.403, -0.525, -0.061, -0.228, 1.086, 0.378, -0.14, 0.599, -0.087, -2.259, -0.098], [1.676, 0.232, -1.508, -0.145, 1.798, -1.388, 1.331, -0.37, -0.939, 0.043, 0.06, -0.414, -1.408, 0.24, 0.622, -0.55, -0.569, 1.873, -0.706, 1.924, -0.254, 1.927, -0.423, 0.152, -0.952, 0.509, -0.496, -0.968, 0.093, -1.049, -0.65, 0.312], [0.207, -0.775, -1.822, 0.321, -0.71, -0.201, 0.3, 1.146, -0.233, -0.753, -0.305, 1.309, -1.47, -0.21, 1.802, -1.555, -1.175, 1.323, -0.303, 0.722, -0.076, 0.103, -1.406, 1.931, 0.091, 0.237, 1.172, 1.607, 0.253, -0.9, -1.068, 0.438], [0.615, 1.077, 0.171, -0.175, 1.3, 0.901, -0.653, -0.138, 0.341, -0.654, -0.184, -0.441, -0.424, 0.356, -0.075, 0.26, -1.023, 0.814, 0.524, -0.904, -0.204, -0.623, 1.234, -1.03, 2.594, 0.56, 1.831, -0.199, -1.508, -0.492, -1.687, -2.165], [0.129, 0.008, -1.279, -0.412, -0.004, 1.663, 0.196, 0.104, 0.123, 0.119, 0.635, 1.757, 2.334, -0.799, -1.626, -1.26, 0.595, -0.316, -1.399, 0.232, 0.264, 1.386, -1.171, -0.256, -0.256, -1.944, 1.168, -0.368, -0.714, -0.51, 0.454, 1.148], [-0.32, 0.29, -1.309, -0.177, 0.453, 0.636, -0.024, 0.509, 0.931, -1.754, -1.575, 0.786, 0.046, -1.165, -1.416, 1.373, 1.293, -0.285, -1.541, -1.186, -0.106, -0.994, 2.001, 0.972, -0.02, 1.654, -0.236, 0.643, 1.02, 0.572, -0.914, -0.154], [0.7, -0.937, 0.441, 0.25, 0.78, -0.022, 0.282, -0.095, 1.558, -0.336, 1.706, 0.884, 1.28, 0.198, -0.796, 1.218, -1.769, 1.197, -0.342, -0.177, -0.645, 1.364, 0.008, -0.597, -0.484, -2.772, -0.696, -0.632, -0.34, -1.527, -0.562, 0.862], [2.504, 0.831, -1.271, -0.033, 0.298, -0.735, 1.339, 1.74, 0.233, -1.424, -0.819, -0.761, 0.291, 0.853, -0.092, -0.885, 0.164, 1.025, 0.907, 0.749, -1.515, -0.545, -1.365, 0.271, 0.034, -2.005, 0.031, 0.244, 0.621, 0.176, 0.336, -1.196], [-0.711, 0.591, -1.001, -0.946, 0.784, -1.66, 1.545, 0.799, -0.857, 1.148, 0.213, -0.285, 0.464, -0.139, 0.79, -1.663, -1.121, 0.575, -0.178, -0.508, 1.565, -0.242, -0.346, 1.024, -1.135, -0.158, -2.101, 0.275, 2.009, -0.425, 0.716, 0.981], [0.912, -1.186, -0.846, -0.421, -1.315, -0.827, 0.309, 0.533, 1.029, -2.343, 1.513, -1.238, 1.487, -0.849, 0.896, -0.927, -0.459, 0.159, 0.177, 0.873, 0.935, 1.433, -0.485, 0.737, 1.327, -0.338, 1.608, -0.47, -0.445, -1.118, -0.213, -0.446], [-0.434, -1.362, -1.098, -1.068, 1.507, 0.003, 0.413, -0.395, 0.897, -0.237, 1.405, -0.344, 1.693, 0.677, 0.097, -0.257, -0.602, 1.026, -1.229, 0.855, -0.713, 1.014, 0.443, 0.238, 0.425, -2.184, 1.933, -1.157, -1.132, -0.597, -0.785, 0.967], [0.58, -0.971, 0.789, -0.468, -0.576, 1.779, 1.747, 1.715, -1.939, 0.125, 0.656, -0.042, -1.024, -1.767, 0.107, -0.408, -0.866, -1.774, 1.248, 0.939, -0.033, 1.523, 1.168, -0.744, 0.209, -0.168, -0.316, 0.207, -0.432, 0.047, -0.646, -0.664], [-0.185, -0.613, -1.695, 1.602, -0.32, -0.277, 0.967, 0.728, -0.965, -0.234, 1.069, -0.63, -1.631, 0.711, 0.426, 1.298, -0.191, -0.467, -0.771, 0.971, -0.118, -1.577, -2.064, -0.055, -0.59, 0.642, -0.997, 1.251, 0.538, 1.367, 0.106, 1.704]]]) # fmt: skip
@require_torch
def test_tokenization_small_model_pt(self):
feature_extractor = pipeline(
@@ -102,46 +88,6 @@ class FeatureExtractionPipelineTests(unittest.TestCase):
tokenize_kwargs=tokenize_kwargs,
)
@require_tf
def test_tokenization_small_model_tf(self):
feature_extractor = pipeline(
task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
)
# test with empty parameters
outputs = feature_extractor("This is a test")
self.assertEqual(
nested_simplify(outputs),
[[[2.287, 1.234, 0.042, 1.53, 1.306, 0.879, -0.526, -1.71, -1.276, 0.756, -0.775, -1.048, -0.25, -0.595, -0.137, -0.598, 2.022, -0.812, 0.284, -0.488, -0.391, -0.403, -0.525, -0.061, -0.228, 1.086, 0.378, -0.14, 0.599, -0.087, -2.259, -0.098], [1.676, 0.232, -1.508, -0.145, 1.798, -1.388, 1.331, -0.37, -0.939, 0.043, 0.06, -0.414, -1.408, 0.24, 0.622, -0.55, -0.569, 1.873, -0.706, 1.924, -0.254, 1.927, -0.423, 0.152, -0.952, 0.509, -0.496, -0.968, 0.093, -1.049, -0.65, 0.312], [0.207, -0.775, -1.822, 0.321, -0.71, -0.201, 0.3, 1.146, -0.233, -0.753, -0.305, 1.309, -1.47, -0.21, 1.802, -1.555, -1.175, 1.323, -0.303, 0.722, -0.076, 0.103, -1.406, 1.931, 0.091, 0.237, 1.172, 1.607, 0.253, -0.9, -1.068, 0.438], [0.615, 1.077, 0.171, -0.175, 1.3, 0.901, -0.653, -0.138, 0.341, -0.654, -0.184, -0.441, -0.424, 0.356, -0.075, 0.26, -1.023, 0.814, 0.524, -0.904, -0.204, -0.623, 1.234, -1.03, 2.594, 0.56, 1.831, -0.199, -1.508, -0.492, -1.687, -2.165], [0.129, 0.008, -1.279, -0.412, -0.004, 1.663, 0.196, 0.104, 0.123, 0.119, 0.635, 1.757, 2.334, -0.799, -1.626, -1.26, 0.595, -0.316, -1.399, 0.232, 0.264, 1.386, -1.171, -0.256, -0.256, -1.944, 1.168, -0.368, -0.714, -0.51, 0.454, 1.148], [-0.32, 0.29, -1.309, -0.177, 0.453, 0.636, -0.024, 0.509, 0.931, -1.754, -1.575, 0.786, 0.046, -1.165, -1.416, 1.373, 1.293, -0.285, -1.541, -1.186, -0.106, -0.994, 2.001, 0.972, -0.02, 1.654, -0.236, 0.643, 1.02, 0.572, -0.914, -0.154], [0.7, -0.937, 0.441, 0.25, 0.78, -0.022, 0.282, -0.095, 1.558, -0.336, 1.706, 0.884, 1.28, 0.198, -0.796, 1.218, -1.769, 1.197, -0.342, -0.177, -0.645, 1.364, 0.008, -0.597, -0.484, -2.772, -0.696, -0.632, -0.34, -1.527, -0.562, 0.862], [2.504, 0.831, -1.271, -0.033, 0.298, -0.735, 1.339, 1.74, 0.233, -1.424, -0.819, -0.761, 0.291, 0.853, -0.092, -0.885, 0.164, 1.025, 0.907, 0.749, -1.515, -0.545, -1.365, 0.271, 0.034, -2.005, 0.031, 0.244, 0.621, 0.176, 0.336, -1.196], [-0.711, 0.591, -1.001, -0.946, 0.784, -1.66, 1.545, 0.799, -0.857, 1.148, 0.213, -0.285, 0.464, -0.139, 0.79, -1.663, -1.121, 0.575, -0.178, -0.508, 1.565, -0.242, -0.346, 1.024, -1.135, -0.158, -2.101, 0.275, 2.009, -0.425, 0.716, 0.981], [0.912, -1.186, -0.846, -0.421, -1.315, -0.827, 0.309, 0.533, 1.029, -2.343, 1.513, -1.238, 1.487, -0.849, 0.896, -0.927, -0.459, 0.159, 0.177, 0.873, 0.935, 1.433, -0.485, 0.737, 1.327, -0.338, 1.608, -0.47, -0.445, -1.118, -0.213, -0.446], [-0.434, -1.362, -1.098, -1.068, 1.507, 0.003, 0.413, -0.395, 0.897, -0.237, 1.405, -0.344, 1.693, 0.677, 0.097, -0.257, -0.602, 1.026, -1.229, 0.855, -0.713, 1.014, 0.443, 0.238, 0.425, -2.184, 1.933, -1.157, -1.132, -0.597, -0.785, 0.967], [0.58, -0.971, 0.789, -0.468, -0.576, 1.779, 1.747, 1.715, -1.939, 0.125, 0.656, -0.042, -1.024, -1.767, 0.107, -0.408, -0.866, -1.774, 1.248, 0.939, -0.033, 1.523, 1.168, -0.744, 0.209, -0.168, -0.316, 0.207, -0.432, 0.047, -0.646, -0.664], [-0.185, -0.613, -1.695, 1.602, -0.32, -0.277, 0.967, 0.728, -0.965, -0.234, 1.069, -0.63, -1.631, 0.711, 0.426, 1.298, -0.191, -0.467, -0.771, 0.971, -0.118, -1.577, -2.064, -0.055, -0.59, 0.642, -0.997, 1.251, 0.538, 1.367, 0.106, 1.704]]]) # fmt: skip
# test with various tokenizer parameters
tokenize_kwargs = {"max_length": 3}
outputs = feature_extractor("This is a test", tokenize_kwargs=tokenize_kwargs)
self.assertEqual(np.squeeze(outputs).shape, (3, 32))
tokenize_kwargs = {"truncation": True, "padding": True, "max_length": 4}
outputs = feature_extractor(
["This is a test", "This", "This is", "This is a", "This is a test test test test"],
tokenize_kwargs=tokenize_kwargs,
)
self.assertEqual(np.squeeze(outputs).shape, (5, 4, 32))
tokenize_kwargs = {"padding": True, "max_length": 4}
outputs = feature_extractor(
["This is a test", "This", "This is", "This is a", "This is a test test test test"],
truncation=True,
tokenize_kwargs=tokenize_kwargs,
)
self.assertEqual(np.squeeze(outputs).shape, (5, 4, 32))
# raise value error if truncation parameter given for two places
tokenize_kwargs = {"truncation": True}
with self.assertRaises(ValueError):
_ = feature_extractor(
["This is a test", "This", "This is", "This is a", "This is a test test test test"],
truncation=True,
tokenize_kwargs=tokenize_kwargs,
)
@require_torch
def test_return_tensors_pt(self):
feature_extractor = pipeline(
@@ -150,14 +96,6 @@ class FeatureExtractionPipelineTests(unittest.TestCase):
outputs = feature_extractor("This is a test", return_tensors=True)
self.assertTrue(torch.is_tensor(outputs))
@require_tf
def test_return_tensors_tf(self):
feature_extractor = pipeline(
task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
)
outputs = feature_extractor("This is a test", return_tensors=True)
self.assertTrue(tf.is_tensor(outputs))
def get_shape(self, input_, shape=None):
if shape is None:
shape = []


@@ -22,7 +22,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_torch_available,
nested_simplify,
require_tf,
require_torch,
require_torch_accelerator,
slow,
@@ -44,47 +43,6 @@
if is_torch_available():
backend_empty_cache(torch_device)
@require_tf
def test_small_model_tf(self):
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", top_k=2, framework="tf")
outputs = unmasker("My name is <mask>")
self.assertEqual(
nested_simplify(outputs, decimals=6),
[
{"sequence": "My name is grouped", "score": 2.1e-05, "token": 38015, "token_str": " grouped"},
{"sequence": "My name is accuser", "score": 2.1e-05, "token": 25506, "token_str": " accuser"},
],
)
outputs = unmasker("The largest city in France is <mask>")
self.assertEqual(
nested_simplify(outputs, decimals=6),
[
{
"sequence": "The largest city in France is grouped",
"score": 2.1e-05,
"token": 38015,
"token_str": " grouped",
},
{
"sequence": "The largest city in France is accuser",
"score": 2.1e-05,
"token": 25506,
"token_str": " accuser",
},
],
)
outputs = unmasker("My name is <mask>", targets=[" Patrick", " Clara", " Teven"], top_k=3)
self.assertEqual(
nested_simplify(outputs, decimals=6),
[
{"sequence": "My name is Clara", "score": 2e-05, "token": 13606, "token_str": " Clara"},
{"sequence": "My name is Patrick", "score": 2e-05, "token": 3499, "token_str": " Patrick"},
{"sequence": "My name is Te", "score": 1.9e-05, "token": 2941, "token_str": " Te"},
],
)
@require_torch
def test_small_model_pt(self):
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", top_k=2, framework="pt")
@@ -172,12 +130,6 @@
unmasker = pipeline(task="fill-mask", model="distilbert/distilroberta-base", top_k=2, framework="pt")
self.run_large_test(unmasker)
@slow
@require_tf
def test_large_model_tf(self):
unmasker = pipeline(task="fill-mask", model="distilbert/distilroberta-base", top_k=2, framework="tf")
self.run_large_test(unmasker)
def run_large_test(self, unmasker):
outputs = unmasker("My name is <mask>")
self.assertEqual(
@@ -244,13 +196,6 @@
unmasker.tokenizer.pad_token = None
self.run_pipeline_test(unmasker, [])
@require_tf
def test_model_no_pad_tf(self):
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="tf")
unmasker.tokenizer.pad_token_id = None
unmasker.tokenizer.pad_token = None
self.run_pipeline_test(unmasker, [])
def get_test_pipeline(
self,
model,


@@ -29,7 +29,6 @@ from transformers.testing_utils import (
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
require_tf,
require_torch,
require_torch_or_tf,
require_vision,
@@ -175,32 +174,6 @@ class ImageClassificationPipelineTests(unittest.TestCase):
],
)
@require_tf
def test_small_model_tf(self):
small_model = "hf-internal-testing/tiny-random-vit"
image_classifier = pipeline("image-classification", model=small_model, framework="tf")
outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
self.assertEqual(
nested_simplify(outputs, decimals=4),
[{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
)
outputs = image_classifier(
[
"http://images.cocodataset.org/val2017/000000039769.jpg",
"http://images.cocodataset.org/val2017/000000039769.jpg",
],
top_k=2,
)
self.assertEqual(
nested_simplify(outputs, decimals=4),
[
[{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
[{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
],
)
def test_custom_tokenizer(self):
tokenizer = PreTrainedTokenizerBase()


@@ -22,20 +22,16 @@ from transformers import (
TF_MODEL_MAPPING,
TOKENIZER_MAPPING,
ImageFeatureExtractionPipeline,
is_tf_available,
is_torch_available,
is_vision_available,
pipeline,
)
-from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch
+from transformers.testing_utils import is_pipeline_test, nested_simplify, require_torch
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
if is_vision_available():
from PIL import Image
@@ -73,28 +69,6 @@ class ImageFeatureExtractionPipelineTests(unittest.TestCase):
nested_simplify(outputs[0]),
[-0.056, 0.083, 0.021, 0.038, 0.242, -0.279, -0.033, -0.003, 0.200, -0.192, 0.045, -0.095, -0.077, 0.017, -0.058, -0.063, -0.029, -0.204, 0.014, 0.042, 0.305, -0.205, -0.099, 0.146, -0.287, 0.020, 0.168, -0.052, 0.046, 0.048, -0.156, 0.093]) # fmt: skip
@require_tf
def test_small_model_tf(self):
feature_extractor = pipeline(
task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit-w-pooler", framework="tf"
)
img = prepare_img()
outputs = feature_extractor(img)
self.assertEqual(
nested_simplify(outputs[0][0]),
[-1.417, -0.392, -1.264, -1.196, 1.648, 0.885, 0.56, -0.606, -1.175, 0.823, 1.912, 0.081, -0.053, 1.119, -0.062, -1.757, -0.571, 0.075, 0.959, 0.118, 1.201, -0.672, -0.498, 0.364, 0.937, -1.623, 0.228, 0.19, 1.697, -1.115, 0.583, -0.981]) # fmt: skip
@require_tf
def test_small_model_w_pooler_tf(self):
feature_extractor = pipeline(
task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit-w-pooler", framework="tf"
)
img = prepare_img()
outputs = feature_extractor(img, pool=True)
self.assertEqual(
nested_simplify(outputs[0]),
[-0.056, 0.083, 0.021, 0.038, 0.242, -0.279, -0.033, -0.003, 0.200, -0.192, 0.045, -0.095, -0.077, 0.017, -0.058, -0.063, -0.029, -0.204, 0.014, 0.042, 0.305, -0.205, -0.099, 0.146, -0.287, 0.020, 0.168, -0.052, 0.046, 0.048, -0.156, 0.093]) # fmt: skip
@require_torch
def test_image_processing_small_model_pt(self):
feature_extractor = pipeline(
@@ -117,28 +91,6 @@ class ImageFeatureExtractionPipelineTests(unittest.TestCase):
outputs = feature_extractor(img, pool=True)
self.assertEqual(np.squeeze(outputs).shape, (32,))
@require_tf
def test_image_processing_small_model_tf(self):
feature_extractor = pipeline(
task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit", framework="tf"
)
# test with image processor parameters
image_processor_kwargs = {"size": {"height": 300, "width": 300}}
img = prepare_img()
with pytest.raises(ValueError):
# Image doesn't match model input size
feature_extractor(img, image_processor_kwargs=image_processor_kwargs)
image_processor_kwargs = {"image_mean": [0, 0, 0], "image_std": [1, 1, 1]}
img = prepare_img()
outputs = feature_extractor(img, image_processor_kwargs=image_processor_kwargs)
self.assertEqual(np.squeeze(outputs).shape, (226, 32))
# Test pooling option
outputs = feature_extractor(img, pool=True)
self.assertEqual(np.squeeze(outputs).shape, (32,))
@require_torch
def test_return_tensors_pt(self):
feature_extractor = pipeline(
@@ -148,15 +100,6 @@ class ImageFeatureExtractionPipelineTests(unittest.TestCase):
outputs = feature_extractor(img, return_tensors=True)
self.assertTrue(torch.is_tensor(outputs))
@require_tf
def test_return_tensors_tf(self):
feature_extractor = pipeline(
task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit", framework="tf"
)
img = prepare_img()
outputs = feature_extractor(img, return_tensors=True)
self.assertTrue(tf.is_tensor(outputs))
def get_test_pipeline(
self,
model,

View File

@@ -39,7 +39,6 @@ from transformers.testing_utils import (
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
require_tf,
require_timm,
require_torch,
require_vision,
@@ -202,11 +201,6 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
for output_element in single_output:
compare_pipeline_output_to_hub_spec(output_element, ImageSegmentationOutputElement)
@require_tf
@unittest.skip(reason="Image segmentation not implemented in TF")
def test_small_model_tf(self):
pass
@require_torch
def test_small_model_pt_no_panoptic(self):
model_id = "hf-internal-testing/tiny-random-mobilevit"

View File

@@ -29,7 +29,6 @@ from transformers.testing_utils import (
Expectations,
is_pipeline_test,
nested_simplify,
require_tf,
require_torch,
require_vision,
slow,
@@ -103,11 +102,6 @@ class MaskGenerationPipelineTests(unittest.TestCase):
def run_pipeline_test(self, mask_generator, examples):
pass
@require_tf
@unittest.skip(reason="Image segmentation not implemented in TF")
def test_small_model_tf(self):
pass
@slow
@require_torch
def test_small_model_pt(self):

View File

@@ -30,7 +30,6 @@ from transformers.testing_utils import (
is_pipeline_test,
nested_simplify,
require_pytesseract,
require_tf,
require_timm,
require_torch,
require_vision,
@@ -128,11 +127,6 @@ class ObjectDetectionPipelineTests(unittest.TestCase):
)
compare_pipeline_output_to_hub_spec(detected_object, ObjectDetectionOutputElement)
@require_tf
@unittest.skip(reason="Object detection not implemented in TF")
def test_small_model_tf(self):
pass
@require_torch
def test_small_model_pt(self):
model_id = "hf-internal-testing/tiny-detr-mobilenetsv3"

View File

@@ -29,7 +29,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_torch_available,
nested_simplify,
require_tf,
require_torch,
require_torch_or_tf,
slow,
@@ -296,17 +295,6 @@ class QAPipelineTests(unittest.TestCase):
answers = [output["answer"] for output in outputs]
self.assertEqual(len(answers), len(set(answers)), "There are duplicate answers in the outputs.")
@require_tf
def test_small_model_tf(self):
question_answerer = pipeline(
"question-answering", model="sshleifer/tiny-distilbert-base-cased-distilled-squad", framework="tf"
)
outputs = question_answerer(
question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris."
)
self.assertEqual(nested_simplify(outputs), {"score": 0.011, "start": 0, "end": 11, "answer": "HuggingFace"})
@slow
@require_torch
def test_large_model_pt(self):
@@ -421,16 +409,6 @@ between them. It's straightforward to train your models with one before loading
{"answer": "Jax, PyTorch and TensorFlow", "end": 1919, "score": 0.971, "start": 1892},
)
@slow
@require_tf
def test_large_model_tf(self):
question_answerer = pipeline("question-answering", framework="tf")
outputs = question_answerer(
question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris."
)
self.assertEqual(nested_simplify(outputs), {"score": 0.979, "start": 27, "end": 32, "answer": "Paris"})
@require_torch_or_tf
class QuestionAnsweringArgumentHandlerTests(unittest.TestCase):

View File

@@ -26,7 +26,6 @@ from transformers.testing_utils import (
is_pipeline_test,
require_pandas,
require_tensorflow_probability,
require_tf,
require_torch,
slow,
)
@@ -38,111 +37,6 @@ class TQAPipelineTests(unittest.TestCase):
# which are needed to generate automatic tests
model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
@require_tensorflow_probability
@require_pandas
@require_tf
@require_torch
def test_small_model_tf(self):
model_id = "lysandre/tiny-tapas-random-wtq"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
self.assertIsInstance(model.config.aggregation_labels, dict)
self.assertIsInstance(model.config.no_aggregation_label_index, int)
table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer, max_new_tokens=20)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query="how many movies has george clooney played in?",
)
self.assertEqual(
outputs,
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
)
self.assertEqual(
outputs,
[
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
],
)
outputs = table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
query=[
"What repository has the largest number of stars?",
"Given that the numbers of stars defines if a repository is active, what repository is the most"
" active?",
"What is the number of repositories?",
"What is the average number of stars?",
"What is the total amount of stars?",
],
)
self.assertEqual(
outputs,
[
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
],
)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table=None)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table="")
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table={})
with self.assertRaises(ValueError):
table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
}
)
with self.assertRaises(ValueError):
table_querier(
query="",
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
with self.assertRaises(ValueError):
table_querier(
query=None,
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
@require_torch
def test_small_model_pt(self, torch_dtype="float32"):
model_id = "lysandre/tiny-tapas-random-wtq"
@@ -372,128 +266,6 @@ class TQAPipelineTests(unittest.TestCase):
def test_slow_tokenizer_sqa_pt_fp16(self):
self.test_slow_tokenizer_sqa_pt(torch_dtype="float16")
@require_tf
@require_tensorflow_probability
@require_pandas
@require_torch
def test_slow_tokenizer_sqa_tf(self):
model_id = "lysandre/tiny-tapas-random-sqa"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer, max_new_tokens=20)
inputs = {
"table": {
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
"query": ["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
}
sequential_outputs = table_querier(**inputs, sequential=True)
batch_outputs = table_querier(**inputs, sequential=False)
self.assertEqual(len(sequential_outputs), 3)
self.assertEqual(len(batch_outputs), 3)
self.assertEqual(sequential_outputs[0], batch_outputs[0])
self.assertNotEqual(sequential_outputs[1], batch_outputs[1])
# self.assertNotEqual(sequential_outputs[2], batch_outputs[2])
table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer, max_new_tokens=20)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query="how many movies has george clooney played in?",
)
self.assertEqual(
outputs,
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
)
self.assertEqual(
outputs,
[
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
],
)
outputs = table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
query=[
"What repository has the largest number of stars?",
"Given that the numbers of stars defines if a repository is active, what repository is the most"
" active?",
"What is the number of repositories?",
"What is the average number of stars?",
"What is the total amount of stars?",
],
)
self.assertEqual(
outputs,
[
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
],
)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table=None)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table="")
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table={})
with self.assertRaises(ValueError):
table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
}
)
with self.assertRaises(ValueError):
table_querier(
query="",
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
with self.assertRaises(ValueError):
table_querier(
query=None,
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
@slow
@require_torch
def test_integration_wtq_pt(self, torch_dtype="float32"):

View File

@@ -24,7 +24,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_torch_available,
nested_simplify,
require_tf,
require_torch,
require_torch_bf16,
require_torch_fp16,
@@ -152,15 +151,6 @@ class TextClassificationPipelineTests(unittest.TestCase):
outputs = text_classifier("This is great !")
self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}])
@require_tf
def test_small_model_tf(self):
text_classifier = pipeline(
task="text-classification", model="hf-internal-testing/tiny-random-distilbert", framework="tf"
)
outputs = text_classifier("This is great !")
self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}])
@slow
@require_torch
def test_pt_bert(self):
@@ -173,18 +163,6 @@ class TextClassificationPipelineTests(unittest.TestCase):
outputs = text_classifier("Birds are a type of animal")
self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}])
@slow
@require_tf
def test_tf_bert(self):
text_classifier = pipeline("text-classification", framework="tf")
outputs = text_classifier("This is great !")
self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 1.0}])
outputs = text_classifier("This is bad !")
self.assertEqual(nested_simplify(outputs), [{"label": "NEGATIVE", "score": 1.0}])
outputs = text_classifier("Birds are a type of animal")
self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}])
def get_test_pipeline(
self,
model,

View File

@@ -29,7 +29,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_torch_available,
nested_simplify,
require_tf,
require_torch,
require_torch_accelerator,
slow,
@@ -823,26 +822,6 @@ class TokenClassificationPipelineTests(unittest.TestCase):
[("▁I", False), ("▁play", False), ("▁the", False), ("▁there", False), ("min", True)],
)
@require_tf
def test_tf_only(self):
model_name = "hf-internal-testing/tiny-random-bert-tf-only" # This model only has a TensorFlow version
# We test that if we don't specify framework='tf', it gets detected automatically
token_classifier = pipeline(task="ner", model=model_name)
self.assertEqual(token_classifier.framework, "tf")
@require_tf
def test_small_model_tf(self):
model_name = "hf-internal-testing/tiny-bert-for-token-classification"
token_classifier = pipeline(task="token-classification", model=model_name, framework="tf")
outputs = token_classifier("This is a test !")
self.assertEqual(
nested_simplify(outputs),
[
{"entity": "I-MISC", "score": 0.115, "index": 1, "word": "this", "start": 0, "end": 4},
{"entity": "I-MISC", "score": 0.115, "index": 2, "word": "is", "start": 5, "end": 7},
],
)
@require_torch
def test_no_offset_tokenizer(self):
model_name = "hf-internal-testing/tiny-bert-for-token-classification"

View File

@@ -23,7 +23,6 @@ from transformers.testing_utils import (
is_pipeline_test,
nested_simplify,
require_av,
require_tf,
require_torch,
require_torch_or_tf,
require_vision,
@@ -124,8 +123,3 @@ class VideoClassificationPipelineTests(unittest.TestCase):
for output in outputs:
for element in output:
compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
@require_tf
@unittest.skip
def test_small_model_tf(self):
pass

View File

@@ -22,7 +22,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_torch_available,
nested_simplify,
require_tf,
require_torch,
require_torch_accelerator,
require_vision,
@@ -246,8 +245,3 @@ class VisualQuestionAnsweringPipelineTests(unittest.TestCase):
[{"score": ANY(float), "answer": ANY(str)}],
],
)
@require_tf
@unittest.skip(reason="Visual question answering not implemented in TF")
def test_small_model_tf(self):
pass

View File

@@ -25,7 +25,6 @@ from transformers.testing_utils import (
is_pipeline_test,
is_torch_available,
nested_simplify,
require_tf,
require_torch,
slow,
)
@@ -243,26 +242,6 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase):
},
)
@require_tf
def test_small_model_tf(self):
zero_shot_classifier = pipeline(
"zero-shot-classification",
model="sshleifer/tiny-distilbert-base-cased-distilled-squad",
framework="tf",
)
outputs = zero_shot_classifier(
"Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"]
)
self.assertEqual(
nested_simplify(outputs),
{
"sequence": "Who are you voting for in 2020?",
"labels": ["science", "public health", "politics"],
"scores": [0.333, 0.333, 0.333],
},
)
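The removed zero-shot TF test has a direct PyTorch analogue; a minimal sketch under the same tiny checkpoint (model name copied from the removed test; the near-uniform scores it asserted are a property of the untrained checkpoint and are not guaranteed here):

```python
from transformers import pipeline

zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="sshleifer/tiny-distilbert-base-cased-distilled-squad",
    framework="pt",
)
outputs = zero_shot_classifier(
    "Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"]
)
# the result is a dict with "sequence", "labels" and "scores" keys
print(outputs["labels"], outputs["scores"])
```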
@slow
@require_torch
def test_large_model_pt(self):
@@ -319,60 +298,3 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase):
"scores": [0.817, 0.713, 0.018, 0.018],
},
)
@slow
@require_tf
def test_large_model_tf(self):
zero_shot_classifier = pipeline(
"zero-shot-classification", model="FacebookAI/roberta-large-mnli", framework="tf"
)
outputs = zero_shot_classifier(
"Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"]
)
self.assertEqual(
nested_simplify(outputs),
{
"sequence": "Who are you voting for in 2020?",
"labels": ["politics", "public health", "science"],
"scores": [0.976, 0.015, 0.009],
},
)
outputs = zero_shot_classifier(
"The dominant sequence transduction models are based on complex recurrent or convolutional neural networks"
" in an encoder-decoder configuration. The best performing models also connect the encoder and decoder"
" through an attention mechanism. We propose a new simple network architecture, the Transformer, based"
" solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two"
" machine translation tasks show these models to be superior in quality while being more parallelizable"
" and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014"
" English-to-German translation task, improving over the existing best results, including ensembles by"
" over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new"
" single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small"
" fraction of the training costs of the best models from the literature. We show that the Transformer"
" generalizes well to other tasks by applying it successfully to English constituency parsing both with"
" large and limited training data.",
candidate_labels=["machine learning", "statistics", "translation", "vision"],
multi_label=True,
)
self.assertEqual(
nested_simplify(outputs),
{
"sequence": (
"The dominant sequence transduction models are based on complex recurrent or convolutional neural"
" networks in an encoder-decoder configuration. The best performing models also connect the"
" encoder and decoder through an attention mechanism. We propose a new simple network"
" architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence"
" and convolutions entirely. Experiments on two machine translation tasks show these models to be"
" superior in quality while being more parallelizable and requiring significantly less time to"
" train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task,"
" improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014"
" English-to-French translation task, our model establishes a new single-model state-of-the-art"
" BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training"
" costs of the best models from the literature. We show that the Transformer generalizes well to"
" other tasks by applying it successfully to English constituency parsing both with large and"
" limited training data."
),
"labels": ["translation", "machine learning", "vision", "statistics"],
"scores": [0.817, 0.713, 0.018, 0.018],
},
)

View File

@@ -22,7 +22,6 @@ from transformers.testing_utils import (
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
require_tf,
require_torch,
require_vision,
slow,
@@ -137,57 +136,6 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase):
def test_small_model_pt_fp16(self):
self.test_small_model_pt(torch_dtype="float16")
@require_tf
def test_small_model_tf(self):
image_classifier = pipeline(
model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", framework="tf"
)
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
output = image_classifier(image, candidate_labels=["a", "b", "c"])
self.assertEqual(
nested_simplify(output),
[{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}],
)
output = image_classifier([image] * 5, candidate_labels=["A", "B", "C"], batch_size=2)
self.assertEqual(
nested_simplify(output),
# Pipeline outputs are supposed to be deterministic and
# So we could in theory have real values "A", "B", "C" instead
# of ANY(str).
# However it seems that in this particular case, the floating
# scores are so close, we enter floating error approximation
# and the order is not guaranteed anymore with batching.
[
[
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
],
[
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
],
[
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
],
[
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
],
[
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
{"score": 0.333, "label": ANY(str)},
],
],
)
@slow
@require_torch
def test_large_model_pt(self):
@@ -221,37 +169,6 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase):
* 5,
)
@slow
@require_tf
def test_large_model_tf(self):
image_classifier = pipeline(
task="zero-shot-image-classification", model="openai/clip-vit-base-patch32", framework="tf"
)
# This is an image of 2 cats with remotes and no planes
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
output = image_classifier(image, candidate_labels=["cat", "plane", "remote"])
self.assertEqual(
nested_simplify(output),
[
{"score": 0.511, "label": "remote"},
{"score": 0.485, "label": "cat"},
{"score": 0.004, "label": "plane"},
],
)
output = image_classifier([image] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2)
self.assertEqual(
nested_simplify(output),
[
[
{"score": 0.511, "label": "remote"},
{"score": 0.485, "label": "cat"},
{"score": 0.004, "label": "plane"},
],
]
* 5,
)
@slow
@require_torch
def test_siglip_model_pt(self):

View File

@@ -23,7 +23,6 @@ from transformers import (
from transformers.testing_utils import (
is_pipeline_test,
nested_simplify,
require_tf,
require_torch,
require_vision,
slow,
@@ -90,11 +89,6 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
],
)
@require_tf
@unittest.skip(reason="Zero Shot Object Detection not implemented in TF")
def test_small_model_tf(self):
pass
@require_torch
def test_small_model_pt(self):
object_detector = pipeline(
@@ -201,11 +195,6 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
],
)
@require_tf
@unittest.skip(reason="Zero Shot Object Detection not implemented in TF")
def test_large_model_tf(self):
pass
@require_torch
@slow
def test_threshold(self):

View File

@@ -17,16 +17,13 @@ import unittest
import numpy as np
from parameterized import parameterized
from transformers.testing_utils import require_flax, require_tf, require_torch, require_vision
from transformers.testing_utils import require_flax, require_torch, require_vision
from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available, is_vision_available
from transformers.utils.import_utils import is_flax_available, is_torch_available, is_vision_available
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
if is_flax_available():
import jax
@@ -122,20 +119,6 @@ class ImageTransformsTester(unittest.TestCase):
self.assertTrue(np_img.min() == 0)
self.assertTrue(np_img.max() == 1)
@require_tf
def test_to_pil_image_from_tensorflow(self):
# channels_first
image = tf.random.uniform((3, 4, 5))
pil_image = to_pil_image(image)
self.assertIsInstance(pil_image, PIL.Image.Image)
self.assertEqual(pil_image.size, (5, 4))
# channels_last
image = tf.random.uniform((4, 5, 3))
pil_image = to_pil_image(image)
self.assertIsInstance(pil_image, PIL.Image.Image)
self.assertEqual(pil_image.size, (5, 4))
@require_torch
def test_to_pil_image_from_torch(self):
# channels first

View File

@@ -16,7 +16,7 @@
import numpy as np
from transformers import BatchFeature
from transformers.testing_utils import require_tf, require_torch
from transformers.testing_utils import require_torch
from .test_feature_extraction_common import FeatureExtractionSavingTestMixin
@@ -76,24 +76,6 @@ class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin):
== (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size)
)
@require_tf
def test_batch_feature_tf(self):
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True)
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
input_name = feat_extract.model_input_names[0]
processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="tf")
batch_features_input = processed_features[input_name]
if len(batch_features_input.shape) < 3:
batch_features_input = batch_features_input[:, :, None]
self.assertTrue(
batch_features_input.shape
== (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size)
)
def _check_padding(self, numpify=False):
def _inputs_have_equal_length(input):
length = len(input[0])
@@ -372,19 +354,6 @@ class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin):
self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().astype(np.float32).sum()) < 1e-2)
@require_tf
def test_padding_accepts_tensors_tf(self):
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common()
input_name = feat_extract.model_input_names[0]
processed_features = BatchFeature({input_name: speech_inputs})
input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
input_tf = feat_extract.pad(processed_features, padding="longest", return_tensors="tf")[input_name]
self.assertTrue(abs(input_np.astype(np.float32).sum() - input_tf.numpy().astype(np.float32).sum()) < 1e-2)
def test_attention_mask(self):
feat_dict = self.feat_extract_dict
feat_dict["return_attention_mask"] = True

View File

@@ -53,7 +53,6 @@ from transformers.testing_utils import (
get_tests_dir,
require_jinja,
require_read_token,
require_tf,
require_tokenizers,
require_torch,
run_test_in_subprocess,
@@ -3106,40 +3105,6 @@ class TokenizerTesterMixin:
# model(**encoded_sequence_fast)
# model(**batch_encoded_sequence_fast)
@require_tf
@slow
def test_tf_encode_plus_sent_to_model(self):
from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING
MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING)
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER_MAPPING")
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
self.skipTest(reason="Model is not an encoder-decoder model or has no set pad token id")
model = model_class(config)
# Make sure the model contains at least the full vocabulary size in its embedding matrix
self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))
# Build sequence
first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
sequence = " ".join(first_ten_tokens)
encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="tf")
batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="tf")
# This should not fail
model(encoded_sequence)
model(batch_encoded_sequence)
# TODO: Check if require_torch is the best to test for numpy here ... Maybe move to require_flax when available
@require_torch
@slow

View File

@@ -39,7 +39,6 @@ from transformers.testing_utils import (
CaptureStderr,
require_flax,
require_sentencepiece,
require_tf,
require_tokenizers,
require_torch,
slow,
@@ -121,27 +120,6 @@ class TokenizerUtilsTest(unittest.TestCase):
tokenizer_r("Small example to encode", return_tensors=TensorType.NUMPY), np.array_equal
)
@require_tf
@require_tokenizers
def test_batch_encoding_pickle_tf(self):
import tensorflow as tf
def tf_array_equals(t1, t2):
return tf.reduce_all(tf.equal(t1, t2))
tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
with self.subTest("BatchEncoding (Python, return_tensors=TENSORFLOW)"):
self.assert_dump_and_restore(
tokenizer_p("Small example to encode", return_tensors=TensorType.TENSORFLOW), tf_array_equals
)
with self.subTest("BatchEncoding (Rust, return_tensors=TENSORFLOW)"):
self.assert_dump_and_restore(
tokenizer_r("Small example to encode", return_tensors=TensorType.TENSORFLOW), tf_array_equals
)
@require_torch
@require_tokenizers
def test_batch_encoding_pickle_pt(self):
@@ -211,22 +189,6 @@ class TokenizerUtilsTest(unittest.TestCase):
self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
self.assertEqual(tensor_batch["labels"].shape, (1,))
@require_tf
def test_batch_encoding_with_labels_tf(self):
batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
tensor_batch = batch.convert_to_tensors(tensor_type="tf")
self.assertEqual(tensor_batch["inputs"].shape, (2, 3))
self.assertEqual(tensor_batch["labels"].shape, (2,))
# test converting the converted
with CaptureStderr() as cs:
tensor_batch = batch.convert_to_tensors(tensor_type="tf")
self.assertFalse(len(cs.err), msg=f"should have no warning, but got {cs.err}")
batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0})
tensor_batch = batch.convert_to_tensors(tensor_type="tf", prepend_batch_axis=True)
self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
self.assertEqual(tensor_batch["labels"].shape, (1,))
@require_flax
def test_batch_encoding_with_labels_jax(self):
batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
@@ -381,20 +343,6 @@ class TokenizerUtilsTest(unittest.TestCase):
self.assertTrue(isinstance(batch["input_ids"], torch.Tensor))
self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
@require_tf
def test_padding_accepts_tensors_tf(self):
import tensorflow as tf
features = [{"input_ids": tf.constant([0, 1, 2])}, {"input_ids": tf.constant([0, 1, 2, 3])}]
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
batch = tokenizer.pad(features, padding=True)
self.assertTrue(isinstance(batch["input_ids"], tf.Tensor))
self.assertEqual(batch["input_ids"].numpy().tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
batch = tokenizer.pad(features, padding=True, return_tensors="tf")
self.assertTrue(isinstance(batch["input_ids"], tf.Tensor))
self.assertEqual(batch["input_ids"].numpy().tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
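The PyTorch counterpart of the removed `tokenizer.pad` tensor test stays in place (see the `torch.Tensor` assertions kept above); as a minimal standalone sketch of the same padding behavior:

```python
import torch

from transformers import BertTokenizer

# pad() accepts features that are already tensors and pads to the longest sequence
features = [{"input_ids": torch.tensor([0, 1, 2])}, {"input_ids": torch.tensor([0, 1, 2, 3])}]
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
batch = tokenizer.pad(features, padding=True, return_tensors="pt")
assert isinstance(batch["input_ids"], torch.Tensor)
assert batch["input_ids"].tolist() == [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]]
```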
@require_tokenizers
def test_instantiation_from_tokenizers(self):
bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

View File

@@ -29,20 +29,16 @@ from transformers import (
DataCollatorWithFlattening,
DataCollatorWithPadding,
default_data_collator,
is_tf_available,
is_torch_available,
set_seed,
)
from transformers.testing_utils import require_tf, require_torch
from transformers.testing_utils import require_torch
from transformers.utils import PaddingStrategy
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
@require_torch
class DataCollatorIntegrationTest(unittest.TestCase):
@@ -1022,795 +1018,6 @@ class DataCollatorImmutabilityTest(unittest.TestCase):
)
@require_tf
class TFDataCollatorIntegrationTest(unittest.TestCase):
def setUp(self):
super().setUp()
self.tmpdirname = tempfile.mkdtemp()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt")
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_default_with_dict(self):
features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].numpy().tolist(), list(range(8)))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
# With label_ids
features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].numpy().tolist(), ([[0, 1, 2]] * 8))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
# Features can already be tensors
features = [{"label": i, "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].numpy().tolist(), (list(range(8))))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 10])
# Labels can already be tensors
features = [{"label": np.array(i), "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["labels"].numpy().tolist(), list(range(8)))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 10])
def test_numpy_dtype_preservation(self):
data_collator = default_data_collator
# Confirms that numpy inputs are handled correctly even when scalars
features = [{"input_ids": np.array([0, 1, 2, 3, 4]), "label": np.int64(i)} for i in range(4)]
batch = data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.int64)
def test_default_classification_and_regression(self):
data_collator = default_data_collator
features = [{"input_ids": [0, 1, 2, 3, 4], "label": i} for i in range(4)]
batch = data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.int64)
features = [{"input_ids": [0, 1, 2, 3, 4], "label": float(i)} for i in range(4)]
batch = data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.float32)
def test_default_with_no_labels(self):
features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertTrue("labels" not in batch)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
# With label_ids
features = [{"label_ids": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertTrue("labels" not in batch)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
def test_data_collator_with_padding(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}]
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=10, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, [2, 8])
def test_data_collator_for_token_classification(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{"input_ids": [0, 1, 2], "labels": [0, 1, 2]},
{"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]},
]
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
self.assertEqual(batch["labels"][0].numpy().tolist(), [0, 1, 2] + [-100] * 3)
data_collator = DataCollatorForTokenClassification(
tokenizer, padding="max_length", max_length=10, return_tensors="tf"
)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
self.assertEqual(batch["labels"][0].numpy().tolist(), [0, 1, 2] + [-1] * 3)
def test_data_collator_for_seq2seq(self):
def create_features():
return [
{"input_ids": list(range(3)), "labels": list(range(3))},
{"input_ids": list(range(6)), "labels": list(range(6))},
]
tokenizer = BertTokenizer(self.vocab_file)
features = create_features()
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=PaddingStrategy.LONGEST, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), list(range(3)) + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["input_ids"][1].numpy().tolist(), list(range(6)))
self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
self.assertEqual(batch["labels"][0].numpy().tolist(), list(range(3)) + [-100] * 3)
self.assertEqual(batch["labels"][1].numpy().tolist(), list(range(6)))
data_collator = DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.MAX_LENGTH, max_length=7, return_tensors="tf"
)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 7])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), list(range(3)) + [tokenizer.pad_token_id] * 4)
self.assertEqual(batch["input_ids"][1].numpy().tolist(), list(range(6)) + [tokenizer.pad_token_id] * 1)
self.assertEqual(batch["labels"].shape.as_list(), [2, 7])
self.assertEqual(batch["labels"][0].numpy().tolist(), list(range(3)) + [-100] * 4)
self.assertEqual(batch["labels"][1].numpy().tolist(), list(range(6)) + [-100] * 1)
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=PaddingStrategy.DO_NOT_PAD, return_tensors="tf")
with self.assertRaises(ValueError):
# expects an error due to unequal shapes to create tensor
data_collator(features)
batch = data_collator([features[0], features[0]])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), features[0]["input_ids"])
self.assertEqual(batch["input_ids"][1].numpy().tolist(), features[0]["input_ids"])
self.assertEqual(batch["labels"][0].numpy().tolist(), features[0]["labels"])
self.assertEqual(batch["labels"][1].numpy().tolist(), features[0]["labels"])
data_collator = DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.LONGEST, pad_to_multiple_of=8, return_tensors="tf"
)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
# side effects on labels cause mismatch on longest strategy
features = create_features()
data_collator = DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.LONGEST, label_pad_token_id=-1, return_tensors="tf"
)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), list(range(3)) + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["input_ids"][1].numpy().tolist(), list(range(6)))
self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
self.assertEqual(batch["labels"][0].numpy().tolist(), list(range(3)) + [-1] * 3)
self.assertEqual(batch["labels"][1].numpy().tolist(), list(range(6)))
for feature in features:
feature.pop("labels")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), list(range(3)) + [tokenizer.pad_token_id] * 3)
def _test_no_pad_and_pad(self, no_pad_features, pad_features):
tokenizer = BertTokenizer(self.vocab_file)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
batch = data_collator(pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
data_collator = DataCollatorForLanguageModeling(
tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="tf"
)
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
batch = data_collator(pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
tokenizer.pad_token = None
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
with self.assertRaises(ValueError):
# Expect error due to padding token missing
data_collator(pad_features)
set_seed(42) # For reproducibility
tokenizer = BertTokenizer(self.vocab_file)
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
batch = data_collator(pad_features, return_tensors="tf")
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
batch = data_collator(pad_features, return_tensors="tf")
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
def test_probability_sum_error(self):
"""Test that the sum of mask_replace_prob and random_replace_prob exceeding 1 raises an error."""
tokenizer = BertTokenizer(self.vocab_file)
with self.assertRaises(ValueError):
DataCollatorForLanguageModeling(tokenizer=tokenizer, mask_replace_prob=0.9, random_replace_prob=0.2)
def test_all_mask_replacement(self):
"""Test behavior when mask_replace_prob=1."""
tokenizer = BertTokenizer(self.vocab_file)
# pytorch call
collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mask_replace_prob=1, random_replace_prob=0, return_tensors="pt"
)
inputs = torch.tensor([0, 1, 2, 3, 4, 5])
features = [{"input_ids": inputs} for _ in range(8)]
batch = collator(features)
# confirm that every token is either the original token or [MASK]
self.assertTrue(torch.all((batch["input_ids"] == inputs) | (batch["input_ids"] == tokenizer.mask_token_id)))
# tf call
collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mask_replace_prob=1, random_replace_prob=0, return_tensors="tf"
)
inputs = tf.constant([0, 1, 2, 3, 4, 5])
features = [{"input_ids": inputs} for _ in range(8)]
batch = collator(features)
# confirm that every token is either the original token or [MASK]
self.assertTrue(
tf.reduce_all(
(batch["input_ids"] == tf.cast(inputs, tf.int64)) | (batch["input_ids"] == tokenizer.mask_token_id)
)
)
# numpy call
collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mask_replace_prob=1, random_replace_prob=0, return_tensors="np"
)
inputs = np.array([0, 1, 2, 3, 4, 5])
features = [{"input_ids": inputs} for _ in range(8)]
batch = collator(features)
# confirm that every token is either the original token or [MASK]
self.assertTrue(np.all((batch["input_ids"] == inputs) | (batch["input_ids"] == tokenizer.mask_token_id)))
def test_data_collator_for_language_modeling(self):
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
self._test_no_pad_and_pad(no_pad_features, pad_features)
no_pad_features = [list(range(10)), list(range(10))]
pad_features = [list(range(5)), list(range(10))]
self._test_no_pad_and_pad(no_pad_features, pad_features)
def test_data_collator_for_language_modeling_with_seed(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [{"input_ids": list(range(1000))}, {"input_ids": list(range(1000))}]
# check if seed is respected between two different DataCollatorForLanguageModeling instances
data_collator = DataCollatorForLanguageModeling(tokenizer, seed=42, return_tensors="tf")
batch_1 = data_collator(features)
self.assertEqual(batch_1["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_1["labels"].shape.as_list(), [2, 1000])
data_collator = DataCollatorForLanguageModeling(tokenizer, seed=42, return_tensors="tf")
batch_2 = data_collator(features)
self.assertEqual(batch_2["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_2["labels"].shape.as_list(), [2, 1000])
self.assertTrue(np.all(batch_1["input_ids"] == batch_2["input_ids"]))
self.assertTrue(np.all(batch_1["labels"] == batch_2["labels"]))
# try with different seed
data_collator = DataCollatorForLanguageModeling(tokenizer, seed=43, return_tensors="tf")
batch_3 = data_collator(features)
self.assertEqual(batch_3["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_3["labels"].shape.as_list(), [2, 1000])
self.assertFalse(np.all(batch_1["input_ids"] == batch_3["input_ids"]))
self.assertFalse(np.all(batch_1["labels"] == batch_3["labels"]))
def test_data_collator_for_whole_word_mask(self):
tokenizer = BertTokenizer(self.vocab_file)
data_collator = DataCollatorForWholeWordMask(tokenizer, return_tensors="tf")
features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
# Features can already be tensors
features = [{"input_ids": np.arange(10)}, {"input_ids": np.arange(10)}]
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
def test_data_collator_for_whole_word_mask_with_seed(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [{"input_ids": list(range(1000))}, {"input_ids": list(range(1000))}]
# check if seed is respected between two different DataCollatorForWholeWordMask instances
data_collator = DataCollatorForWholeWordMask(tokenizer, seed=42, return_tensors="tf")
batch_1 = data_collator(features)
self.assertEqual(batch_1["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_1["labels"].shape.as_list(), [2, 1000])
data_collator = DataCollatorForWholeWordMask(tokenizer, seed=42, return_tensors="tf")
batch_2 = data_collator(features)
self.assertEqual(batch_2["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_2["labels"].shape.as_list(), [2, 1000])
self.assertTrue(np.all(batch_1["input_ids"] == batch_2["input_ids"]))
self.assertTrue(np.all(batch_1["labels"] == batch_2["labels"]))
# try with different seed
data_collator = DataCollatorForWholeWordMask(tokenizer, seed=43, return_tensors="tf")
batch_3 = data_collator(features)
self.assertEqual(batch_3["input_ids"].shape.as_list(), [2, 1000])
self.assertEqual(batch_3["labels"].shape.as_list(), [2, 1000])
self.assertFalse(np.all(batch_1["input_ids"] == batch_3["input_ids"]))
self.assertFalse(np.all(batch_1["labels"] == batch_3["labels"]))
def test_plm(self):
tokenizer = BertTokenizer(self.vocab_file)
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
data_collator = DataCollatorForPermutationLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(pad_features)
self.assertIsInstance(batch, dict)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["perm_mask"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["target_mapping"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
batch = data_collator(no_pad_features)
self.assertIsInstance(batch, dict)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["perm_mask"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["target_mapping"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
example = [np.random.randint(0, 5, [5])]
with self.assertRaises(ValueError):
# Expect error due to odd sequence length
data_collator(example)
def test_nsp(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i}
for i in range(2)
]
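# the MLM collator pads input_ids and token_type_ids, while extra keys such as
# next_sentence_label are simply batched, keeping one scalar per example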
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["labels"].shape.as_list(), [2, 5])
self.assertEqual(batch["next_sentence_label"].shape.as_list(), [2])
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
self.assertEqual(batch["next_sentence_label"].shape.as_list(), [2])
def test_sop(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{
"input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
"token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
"sentence_order_label": i,
}
for i in range(2)
]
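# as with NSP above, the scalar sentence_order_label should pass through batching unchanged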
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["labels"].shape.as_list(), [2, 5])
self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])
@require_tf
class TFDataCollatorImmutabilityTest(unittest.TestCase):
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt")
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def _turn_to_none(self, item):
"""used to convert `item` to `None` type"""
return None
def _validate_original_data_against_collated_data(self, collator, original_data, batch_data):
# we only care about side effects; the results are tested elsewhere
collator(batch_data)
# go through every item, converting to primitive datatypes where necessary,
# then compare the original data against the data that was passed through the collator
for original, batch in zip(original_data, batch_data):
for original_val, batch_val in zip(original.values(), batch.values()):
if isinstance(original_val, np.ndarray):
self.assertEqual(original_val.tolist(), batch_val.tolist())
elif isinstance(original_val, tf.Tensor):
self.assertEqual(original_val.numpy().tolist(), batch_val.numpy().tolist())
else:
self.assertEqual(original_val, batch_val)
def _validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
self, collator, base_data, input_key, input_datatype, label_key, label_datatype, ignore_label=False
):
# using the arguments to recreate the features with their respective (potentially new) datatypes
features_original = [
{label_key: label_datatype(sample[label_key]), input_key: input_datatype(sample[input_key])}
for sample in base_data
]
features_batch = [
{label_key: label_datatype(sample[label_key]), input_key: input_datatype(sample[input_key])}
for sample in base_data
]
# some collators do not use labels, and we sometimes want to check that a collator which does use labels can handle their absence
if ignore_label:
for original, batch in zip(features_original, features_batch):
original.pop(label_key)
batch.pop(label_key)
self._validate_original_data_against_collated_data(
collator=collator, original_data=features_original, batch_data=features_batch
)
def test_default_collator_immutability(self):
features_base_single_label = [{"label": i, "inputs": (0, 1, 2, 3, 4, 5)} for i in range(4)]
features_base_multiple_labels = [{"label": (0, 1, 2), "inputs": (0, 1, 2, 3, 4, 5)} for i in range(4)]
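# each (input datatype, label datatype) pair below exercises a different conversion
# path inside default_data_collator; the originals must stay untouched in every case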
for datatype_input, datatype_label in [
(list, int),
(list, float),
(np.array, int),
(np.array, tf.constant),
(list, self._turn_to_none),
]:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=lambda x: default_data_collator(x, return_tensors="tf"),
base_data=features_base_single_label,
input_key="inputs",
input_datatype=datatype_input,
label_key="label",
label_datatype=datatype_label,
)
for datatype_input, datatype_label in [(list, list), (list, self._turn_to_none)]:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=lambda x: default_data_collator(x, return_tensors="tf"),
base_data=features_base_multiple_labels,
input_key="inputs",
input_datatype=datatype_input,
label_key="label",
label_datatype=datatype_label,
)
features_base_single_label_alt = [{"input_ids": (0, 1, 2, 3, 4), "label": float(i)} for i in range(4)]
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=lambda x: default_data_collator(x, return_tensors="tf"),
base_data=features_base_single_label_alt,
input_key="input_ids",
input_datatype=list,
label_key="label",
label_datatype=float,
)
def test_with_padding_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_original = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}]
features_batch = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}]
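# neither fixed max_length padding nor pad_to_multiple_of should mutate the caller's features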
data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=10, return_tensors="tf")
self._validate_original_data_against_collated_data(
collator=data_collator, original_data=features_original, batch_data=features_batch
)
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
self._validate_original_data_against_collated_data(
collator=data_collator, original_data=features_original, batch_data=features_batch
)
def test_for_token_classification_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_base = [
{"input_ids": (0, 1, 2), "labels": (0, 1, 2)},
{"input_ids": (0, 1, 2, 3, 4, 5), "labels": (0, 1, 2, 3, 4, 5)},
]
token_classification_collators = [
DataCollatorForTokenClassification(tokenizer, return_tensors="tf"),
DataCollatorForTokenClassification(tokenizer, padding="max_length", max_length=10, return_tensors="tf"),
DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8, return_tensors="tf"),
DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1, return_tensors="tf"),
]
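# the variants above cover default padding, a fixed max_length, multiple-of-8 padding,
# and a custom label_pad_token_id; none of them may modify the original features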
for datatype_input, datatype_label in [(list, list)]:
for collator in token_classification_collators:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=collator,
base_data=features_base,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
)
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=token_classification_collators[-1],
base_data=features_base,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
ignore_label=True,
)
def test_seq2seq_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_base = [
{"input_ids": list(range(3)), "labels": list(range(3))},
{"input_ids": list(range(6)), "labels": list(range(6))},
]
seq2seq_collators = [
DataCollatorForSeq2Seq(tokenizer, padding=PaddingStrategy.LONGEST, return_tensors="tf"),
DataCollatorForSeq2Seq(tokenizer, padding=PaddingStrategy.MAX_LENGTH, max_length=7, return_tensors="tf"),
DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.LONGEST, pad_to_multiple_of=8, return_tensors="tf"
),
DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.LONGEST, label_pad_token_id=-1, return_tensors="tf"
),
]
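# the same immutability requirement applies to every seq2seq padding strategy,
# including the custom label_pad_token_id variant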
for datatype_input, datatype_label in [(list, list)]:
for collator in seq2seq_collators:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=collator,
base_data=features_base,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
)
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=seq2seq_collators[-1],
base_data=features_base,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
ignore_label=True,
)
features_base_no_pad = [
{"input_ids": list(range(3)), "labels": list(range(3))},
{"input_ids": list(range(3)), "labels": list(range(3))},
]
seq2seq_no_padding_collator = DataCollatorForSeq2Seq(
tokenizer, padding=PaddingStrategy.DO_NOT_PAD, return_tensors="tf"
)
for datatype_input, datatype_label in [(list, list)]:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=seq2seq_no_padding_collator,
base_data=features_base_no_pad,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
)
def test_language_modelling_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_base_no_pad = [
{"input_ids": tuple(range(10)), "labels": (1,)},
{"input_ids": tuple(range(10)), "labels": (1,)},
]
features_base_pad = [
{"input_ids": tuple(range(5)), "labels": (1,)},
{"input_ids": tuple(range(5)), "labels": (1,)},
]
lm_collators = [
DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf"),
DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="tf"),
DataCollatorForLanguageModeling(tokenizer, return_tensors="tf"),
DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf"),
]
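# mlm=False takes the causal-LM path (labels are a copy of input_ids with padding
# masked to -100), while the default mlm=True applies random token masking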
for datatype_input, datatype_label in [(list, list)]:
for collator in lm_collators:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=collator,
base_data=features_base_no_pad,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
ignore_label=True,
)
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=collator,
base_data=features_base_pad,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
ignore_label=True,
)
def test_whole_word_masking_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_base = [
{"input_ids": list(range(10)), "labels": (1,)},
{"input_ids": list(range(10)), "labels": (1,)},
]
whole_word_masking_collator = DataCollatorForWholeWordMask(tokenizer, return_tensors="tf")
for datatype_input, datatype_label in [(list, list), (np.array, np.array)]:
self._validate_original_data_against_collated_data_on_specified_keys_and_datatypes(
collator=whole_word_masking_collator,
base_data=features_base,
input_key="input_ids",
input_datatype=datatype_input,
label_key="labels",
label_datatype=datatype_label,
ignore_label=True,
)
def test_permutation_language_modelling_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
plm_collator = DataCollatorForPermutationLanguageModeling(tokenizer, return_tensors="tf")
no_pad_features_original = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
no_pad_features_batch = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
self._validate_original_data_against_collated_data(
collator=plm_collator, original_data=no_pad_features_original, batch_data=no_pad_features_batch
)
pad_features_original = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
pad_features_batch = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
self._validate_original_data_against_collated_data(
collator=plm_collator, original_data=pad_features_original, batch_data=pad_features_batch
)
def test_next_sentence_prediction_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_original = [
{"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i}
for i in range(2)
]
features_batch = [
{"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i}
for i in range(2)
]
nsp_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
self._validate_original_data_against_collated_data(
collator=nsp_collator, original_data=features_original, batch_data=features_batch
)
nsp_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
self._validate_original_data_against_collated_data(
collator=nsp_collator, original_data=features_original, batch_data=features_batch
)
def test_sentence_order_prediction_collator_immutability(self):
tokenizer = BertTokenizer(self.vocab_file)
features_original = [
{
"input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
"token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
"sentence_order_label": i,
}
for i in range(2)
]
features_batch = [
{
"input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
"token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
"sentence_order_label": i,
}
for i in range(2)
]
sop_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
self._validate_original_data_against_collated_data(
collator=sop_collator, original_data=features_original, batch_data=features_batch
)
sop_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
self._validate_original_data_against_collated_data(
collator=sop_collator, original_data=features_original, batch_data=features_batch
)
class NumpyDataCollatorIntegrationTest(unittest.TestCase):
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
View File
@ -1,60 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers import is_tf_available
from transformers.testing_utils import require_tf
if is_tf_available():
import tensorflow as tf
from transformers.activations_tf import get_tf_activation
@require_tf
class TestTFActivations(unittest.TestCase):
def test_gelu_10(self):
x = tf.constant([-100, -1.0, -0.1, 0, 0.1, 1.0, 100.0])
gelu = get_tf_activation("gelu")
gelu10 = get_tf_activation("gelu_10")
y_gelu = gelu(x)
y_gelu_10 = gelu10(x)
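# gelu_10 should match gelu element-wise below the clip point and saturate at 10.0 above it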
clipped_mask = tf.where(y_gelu_10 < 10.0, 1.0, 0.0)
self.assertEqual(tf.math.reduce_max(y_gelu_10).numpy().item(), 10.0)
self.assertTrue(np.allclose(y_gelu * clipped_mask, y_gelu_10 * clipped_mask))
def test_get_activation(self):
get_tf_activation("gelu")
get_tf_activation("gelu_10")
get_tf_activation("gelu_fast")
get_tf_activation("gelu_new")
get_tf_activation("glu")
get_tf_activation("mish")
get_tf_activation("quick_gelu")
get_tf_activation("relu")
get_tf_activation("sigmoid")
get_tf_activation("silu")
get_tf_activation("swish")
get_tf_activation("tanh")
with self.assertRaises(KeyError):
get_tf_activation("bogus")
with self.assertRaises(KeyError):
get_tf_activation(None)
View File
@ -36,7 +36,7 @@ from transformers.commands.add_new_model_like import (
retrieve_model_classes,
simplify_replacements,
)
from transformers.testing_utils import require_flax, require_tf, require_torch
from transformers.testing_utils import require_flax, require_torch
BERT_MODEL_FILES = {
@ -84,7 +84,6 @@ REPO_PATH = Path(transformers.__path__[0]).parent.parent
@require_torch
@require_tf
@require_flax
class TestAddNewModelLike(unittest.TestCase):
def init_file(self, file_name, content):
View File
@ -19,7 +19,7 @@ from pathlib import Path
from typing import Union
import transformers
from transformers.testing_utils import require_tf, require_torch, slow
from transformers.testing_utils import require_torch, slow
logger = logging.getLogger()
@ -27,7 +27,6 @@ logger = logging.getLogger()
@unittest.skip(reason="Temporarily disable the doc tests.")
@require_torch
@require_tf
@slow
class TestCodeExamples(unittest.TestCase):
def analyze_directory(
View File
@ -21,16 +21,13 @@ import transformers
# Try to import everything from transformers to ensure every object can be loaded.
from transformers import * # noqa F406
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_flax, require_tf, require_torch
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_flax, require_torch
from transformers.utils import ContextManagers, find_labels, is_flax_available, is_tf_available, is_torch_available
from transformers.utils import ContextManagers, find_labels, is_flax_available, is_torch_available
if is_torch_available():
from transformers import BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification
if is_tf_available():
from transformers import TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification
if is_flax_available():
from transformers import FlaxBertForPreTraining, FlaxBertForQuestionAnswering, FlaxBertForSequenceClassification
@ -107,18 +104,6 @@ class GenericUtilTests(unittest.TestCase):
self.assertEqual(find_labels(DummyModel), ["labels"])
@require_tf
def test_find_labels_tf(self):
self.assertEqual(find_labels(TFBertForSequenceClassification), ["labels"])
self.assertEqual(find_labels(TFBertForPreTraining), ["labels", "next_sentence_label"])
self.assertEqual(find_labels(TFBertForQuestionAnswering), ["start_positions", "end_positions"])
# find_labels works regardless of the class name (it detects the framework through inheritance)
class DummyModel(TFBertForSequenceClassification):
pass
self.assertEqual(find_labels(DummyModel), ["labels"])
@require_flax
def test_find_labels_flax(self):
# Flax models don't have labels
View File
@ -19,14 +19,13 @@ import numpy as np
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import BaseModelOutput
from transformers.testing_utils import require_flax, require_tf, require_torch
from transformers.testing_utils import require_flax, require_torch
from transformers.utils import (
can_return_tuple,
expand_dims,
filter_out_non_signature_kwargs,
flatten_dict,
is_flax_available,
is_tf_available,
is_torch_available,
reshape,
squeeze,
@ -38,9 +37,6 @@ from transformers.utils import (
if is_flax_available():
import jax.numpy as jnp
if is_tf_available():
import tensorflow as tf
if is_torch_available():
import torch
@ -88,16 +84,6 @@ class GenericTester(unittest.TestCase):
t = torch.tensor(x)
self.assertTrue(np.allclose(transpose(x, axes=(1, 2, 0)), transpose(t, axes=(1, 2, 0)).numpy()))
@require_tf
def test_transpose_tf(self):
x = np.random.randn(3, 4)
t = tf.constant(x)
self.assertTrue(np.allclose(transpose(x), transpose(t).numpy()))
x = np.random.randn(3, 4, 5)
t = tf.constant(x)
self.assertTrue(np.allclose(transpose(x, axes=(1, 2, 0)), transpose(t, axes=(1, 2, 0)).numpy()))
@require_flax
def test_transpose_flax(self):
x = np.random.randn(3, 4)
@ -125,16 +111,6 @@ class GenericTester(unittest.TestCase):
t = torch.tensor(x)
self.assertTrue(np.allclose(reshape(x, (12, 5)), reshape(t, (12, 5)).numpy()))
@require_tf
def test_reshape_tf(self):
x = np.random.randn(3, 4)
t = tf.constant(x)
self.assertTrue(np.allclose(reshape(x, (4, 3)), reshape(t, (4, 3)).numpy()))
x = np.random.randn(3, 4, 5)
t = tf.constant(x)
self.assertTrue(np.allclose(reshape(x, (12, 5)), reshape(t, (12, 5)).numpy()))
@require_flax
def test_reshape_flax(self):
x = np.random.randn(3, 4)
@ -162,16 +138,6 @@ class GenericTester(unittest.TestCase):
t = torch.tensor(x)
self.assertTrue(np.allclose(squeeze(x, axis=2), squeeze(t, axis=2).numpy()))
@require_tf
def test_squeeze_tf(self):
x = np.random.randn(1, 3, 4)
t = tf.constant(x)
self.assertTrue(np.allclose(squeeze(x), squeeze(t).numpy()))
x = np.random.randn(1, 4, 1, 5)
t = tf.constant(x)
self.assertTrue(np.allclose(squeeze(x, axis=2), squeeze(t, axis=2).numpy()))
@require_flax
def test_squeeze_flax(self):
x = np.random.randn(1, 3, 4)
@ -192,12 +158,6 @@ class GenericTester(unittest.TestCase):
t = torch.tensor(x)
self.assertTrue(np.allclose(expand_dims(x, axis=1), expand_dims(t, axis=1).numpy()))
@require_tf
def test_expand_dims_tf(self):
x = np.random.randn(3, 4)
t = tf.constant(x)
self.assertTrue(np.allclose(expand_dims(x, axis=1), expand_dims(t, axis=1).numpy()))
@require_flax
def test_expand_dims_flax(self):
x = np.random.randn(3, 4)
@ -232,18 +192,6 @@ class GenericTester(unittest.TestCase):
self.assertTrue(to_py_obj([t1, t2]) == [x1, x2])
@require_tf
def test_to_py_obj_tf(self):
x1 = [[1, 2, 3], [4, 5, 6]]
t1 = tf.constant(x1)
self.assertTrue(to_py_obj(t1) == x1)
x2 = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
t2 = tf.constant(x2)
self.assertTrue(to_py_obj(t2) == x2)
self.assertTrue(to_py_obj([t1, t2]) == [x1, x2])
@require_flax
def test_to_py_obj_flax(self):
x1 = [[1, 2, 3], [4, 5, 6]]
@ -256,25 +204,6 @@ class GenericTester(unittest.TestCase):
self.assertTrue(to_py_obj([t1, t2]) == [x1, x2])
@require_torch
@require_tf
@require_flax
def test_to_py_obj_mixed(self):
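# to_py_obj should recursively convert a nested mix of numpy, torch, tf, and jax
# tensors into plain Python lists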
x1 = [[1], [2]]
t1 = np.array(x1)
x2 = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
t2 = torch.tensor(x2)
x3 = [1, 2, 3]
t3 = tf.constant(x3)
x4 = [[[1.0, 2.0]]]
t4 = jnp.array(x4)
mixed = [(t1, t2), (t3, t4)]
self.assertTrue(to_py_obj(mixed) == [[x1, x2], [x3, x4]])
class ValidationDecoratorTester(unittest.TestCase):
def test_cases_no_warning(self):
View File
@ -61,7 +61,6 @@ from transformers.testing_utils import (
require_non_hpu,
require_read_token,
require_safetensors,
require_tf,
require_torch,
require_torch_accelerator,
require_torch_multi_accelerator,
@ -79,7 +78,6 @@ from transformers.utils.import_utils import (
is_flash_attn_2_available,
is_flash_attn_3_available,
is_flax_available,
is_tf_available,
is_torch_npu_available,
is_torch_sdpa_available,
)
@ -322,9 +320,6 @@ class TestModelGammaBeta(PreTrainedModel):
if is_flax_available():
from transformers import FlaxBertModel
if is_tf_available():
from transformers import TFBertModel
TINY_T5 = "patrickvonplaten/t5-tiny-random"
TINY_BERT_FOR_TOKEN_CLASSIFICATION = "hf-internal-testing/tiny-bert-for-token-classification"
@ -1535,27 +1530,6 @@ class ModelUtilsTest(TestCasePlus):
for p1, p2 in zip(hub_model.parameters(), new_model.parameters()):
self.assertTrue(torch.equal(p1, p2))
@require_tf
@require_safetensors
def test_safetensors_torch_from_tf(self):
hub_model = BertModel.from_pretrained("hf-internal-testing/tiny-bert-pt-only")
model = TFBertModel.from_pretrained("hf-internal-testing/tiny-bert-tf-only")
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir, safe_serialization=True)
new_model = BertModel.from_pretrained(tmp_dir)
for p1, p2 in zip(hub_model.parameters(), new_model.parameters()):
self.assertTrue(torch.equal(p1, p2))
@require_tf
def test_torch_from_tf(self):
model = TFBertModel.from_pretrained("hf-internal-testing/tiny-bert-tf-only")
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
_ = BertModel.from_pretrained(tmp_dir, from_tf=True)
@require_safetensors
def test_safetensors_torch_from_torch_sharded(self):
model = BertModel.from_pretrained("hf-internal-testing/tiny-bert-pt-only")