[CI] lazy loading external datasets (#37218)

This commit is contained in:
Joao Gante 2025-04-03 09:57:45 +01:00 committed by GitHub
parent a0803a9555
commit 2099287a59
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 63 additions and 37 deletions

View File

@ -25,7 +25,6 @@ from transformers import (
)
from transformers.pipelines import AudioClassificationPipeline, pipeline
from transformers.testing_utils import (
_run_pipeline_tests,
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
@ -46,9 +45,15 @@ if is_torch_available():
class AudioClassificationPipelineTests(unittest.TestCase):
model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
tf_model_mapping = TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
_dataset = None
if _run_pipeline_tests:
_dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
@classmethod
def _load_dataset(cls):
    """Populate ``cls._dataset`` on first use.

    Being a classmethod, the download happens at most once per pytest
    process instead of at module import time.
    """
    if cls._dataset is not None:
        return
    cls._dataset = datasets.load_dataset(
        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
    )
def get_test_pipeline(
self,
@ -99,6 +104,7 @@ class AudioClassificationPipelineTests(unittest.TestCase):
@require_torchaudio
def run_torchaudio(self, audio_classifier):
self._load_dataset()
# test with a local file
audio = self._dataset[0]["audio"]["array"]
output = audio_classifier(audio)

View File

@ -21,7 +21,6 @@ from huggingface_hub.utils import insecure_hashlib
from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available
from transformers.pipelines import DepthEstimationPipeline, pipeline
from transformers.testing_utils import (
_run_pipeline_tests,
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
@ -59,13 +58,17 @@ def hashimage(image: Image) -> str:
@require_torch
class DepthEstimationPipelineTests(unittest.TestCase):
model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING
_dataset = None
if _run_pipeline_tests:
# we use revision="refs/pr/1" until the PR is merged
# https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
_dataset = datasets.load_dataset(
"hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
)
@classmethod
def _load_dataset(cls):
    """Populate ``cls._dataset`` on first use (once per pytest process)."""
    if cls._dataset is not None:
        return
    # we use revision="refs/pr/1" until the PR is merged
    # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
    cls._dataset = datasets.load_dataset(
        "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
    )
def get_test_pipeline(
self,
@ -90,6 +93,7 @@ class DepthEstimationPipelineTests(unittest.TestCase):
]
def run_pipeline_test(self, depth_estimator, examples):
self._load_dataset()
outputs = depth_estimator("./tests/fixtures/tests_samples/COCO/000000039769.png")
self.assertEqual({"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)}, outputs)

View File

@ -26,7 +26,6 @@ from transformers import (
)
from transformers.pipelines import ImageClassificationPipeline, pipeline
from transformers.testing_utils import (
_run_pipeline_tests,
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
@ -59,13 +58,17 @@ else:
class ImageClassificationPipelineTests(unittest.TestCase):
model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
tf_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
_dataset = None
if _run_pipeline_tests:
# we use revision="refs/pr/1" until the PR is merged
# https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
_dataset = datasets.load_dataset(
"hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
)
@classmethod
def _load_dataset(cls):
    """Populate ``cls._dataset`` on first use (once per pytest process)."""
    if cls._dataset is not None:
        return
    # we use revision="refs/pr/1" until the PR is merged
    # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
    cls._dataset = datasets.load_dataset(
        "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
    )
def get_test_pipeline(
self,
@ -92,6 +95,7 @@ class ImageClassificationPipelineTests(unittest.TestCase):
return image_classifier, examples
def run_pipeline_test(self, image_classifier, examples):
self._load_dataset()
outputs = image_classifier("./tests/fixtures/tests_samples/COCO/000000039769.png")
self.assertEqual(

View File

@ -37,7 +37,6 @@ from transformers import (
pipeline,
)
from transformers.testing_utils import (
_run_pipeline_tests,
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
@ -89,13 +88,17 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
+ (MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else [])
+ (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else [])
)
_dataset = None
if _run_pipeline_tests:
# we use revision="refs/pr/1" until the PR is merged
# https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
_dataset = datasets.load_dataset(
"hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
)
@classmethod
def _load_dataset(cls):
    """Populate ``cls._dataset`` on first use (once per pytest process)."""
    if cls._dataset is not None:
        return
    # we use revision="refs/pr/1" until the PR is merged
    # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
    cls._dataset = datasets.load_dataset(
        "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
    )
def get_test_pipeline(
self,
@ -120,6 +123,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
]
def run_pipeline_test(self, image_segmenter, examples):
self._load_dataset()
outputs = image_segmenter(
"./tests/fixtures/tests_samples/COCO/000000039769.png",
threshold=0.0,

View File

@ -25,8 +25,7 @@ from transformers import (
is_vision_available,
pipeline,
)
from transformers.testing_utils import ( #
_run_pipeline_tests,
from transformers.testing_utils import (
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
@ -57,13 +56,17 @@ else:
@require_torch
class ObjectDetectionPipelineTests(unittest.TestCase):
model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
_dataset = None
if _run_pipeline_tests:
# we use revision="refs/pr/1" until the PR is merged
# https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
_dataset = datasets.load_dataset(
"hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
)
@classmethod
def _load_dataset(cls):
    """Populate ``cls._dataset`` on first use (once per pytest process)."""
    if cls._dataset is not None:
        return
    # we use revision="refs/pr/1" until the PR is merged
    # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
    cls._dataset = datasets.load_dataset(
        "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
    )
def get_test_pipeline(
self,
@ -85,6 +88,7 @@ class ObjectDetectionPipelineTests(unittest.TestCase):
return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"]
def run_pipeline_test(self, object_detector, examples):
self._load_dataset()
outputs = object_detector("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
self.assertGreater(len(outputs), 0)

View File

@ -19,7 +19,6 @@ from huggingface_hub import VideoClassificationOutputElement, hf_hub_download
from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEFeatureExtractor
from transformers.pipelines import VideoClassificationPipeline, pipeline
from transformers.testing_utils import (
_run_pipeline_tests,
compare_pipeline_output_to_hub_spec,
is_pipeline_test,
nested_simplify,
@ -39,11 +38,15 @@ from .test_pipelines_common import ANY
@require_av
class VideoClassificationPipelineTests(unittest.TestCase):
model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING
example_video_filepath = None
if _run_pipeline_tests:
example_video_filepath = hf_hub_download(
repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset"
)
@classmethod
def _load_dataset(cls):
    """Download the example video on first use (once per pytest process)."""
    if cls.example_video_filepath is not None:
        return
    cls.example_video_filepath = hf_hub_download(
        repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset"
    )
def get_test_pipeline(
self,
@ -54,6 +57,7 @@ class VideoClassificationPipelineTests(unittest.TestCase):
processor=None,
torch_dtype="float32",
):
self._load_dataset()
video_classifier = VideoClassificationPipeline(
model=model,
tokenizer=tokenizer,