[CI] lazy loading external datasets (#37218)

Joao Gante 2025-04-03 09:57:45 +01:00 committed by GitHub
parent a0803a9555
commit 2099287a59
6 changed files with 63 additions and 37 deletions
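
All six files get the same treatment: a dataset download that previously ran at import time, guarded by the module-level _run_pipeline_tests flag, moves into a class method that caches its result on the class, so nothing is fetched until a test actually needs the data. A minimal standalone sketch of the pattern follows; the class name and the test method are illustrative stand-ins, and only the _load_dataset body mirrors the commit:

    import datasets

    class ExamplePipelineTests:
        _dataset = None  # class-level cache, empty until first use

        @classmethod
        def _load_dataset(cls):
            # Nothing is downloaded at import time; the first call performs
            # the download and every later call reuses the cached dataset.
            if cls._dataset is None:
                cls._dataset = datasets.load_dataset(
                    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
                )

        def test_first_sample(self):
            self._load_dataset()
            audio = self._dataset[0]["audio"]["array"]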

tests/pipelines/test_pipelines_audio_classification.py

@@ -25,7 +25,6 @@ from transformers import (
 )
 from transformers.pipelines import AudioClassificationPipeline, pipeline
 from transformers.testing_utils import (
-    _run_pipeline_tests,
     compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
@@ -46,9 +45,15 @@ if is_torch_available():
 class AudioClassificationPipelineTests(unittest.TestCase):
     model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
     tf_model_mapping = TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
+    _dataset = None
 
-    if _run_pipeline_tests:
-        _dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls._dataset is None:
+            cls._dataset = datasets.load_dataset(
+                "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+            )
 
     def get_test_pipeline(
         self,
@@ -99,6 +104,7 @@ class AudioClassificationPipelineTests(unittest.TestCase):
 
     @require_torchaudio
     def run_torchaudio(self, audio_classifier):
+        self._load_dataset()
         # test with a local file
         audio = self._dataset[0]["audio"]["array"]
         output = audio_classifier(audio)

tests/pipelines/test_pipelines_depth_estimation.py

@@ -21,7 +21,6 @@ from huggingface_hub.utils import insecure_hashlib
 from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available
 from transformers.pipelines import DepthEstimationPipeline, pipeline
 from transformers.testing_utils import (
-    _run_pipeline_tests,
     compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
@@ -59,11 +58,15 @@ def hashimage(image: Image) -> str:
 @require_torch
 class DepthEstimationPipelineTests(unittest.TestCase):
     model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING
+    _dataset = None
 
-    if _run_pipeline_tests:
-        # we use revision="refs/pr/1" until the PR is merged
-        # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
-        _dataset = datasets.load_dataset(
-            "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
-        )
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls._dataset is None:
+            # we use revision="refs/pr/1" until the PR is merged
+            # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
+            cls._dataset = datasets.load_dataset(
+                "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
+            )
 
@@ -90,6 +93,7 @@ class DepthEstimationPipelineTests(unittest.TestCase):
     ]
 
     def run_pipeline_test(self, depth_estimator, examples):
+        self._load_dataset()
         outputs = depth_estimator("./tests/fixtures/tests_samples/COCO/000000039769.png")
         self.assertEqual({"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)}, outputs)

tests/pipelines/test_pipelines_image_classification.py

@@ -26,7 +26,6 @@ from transformers import (
 )
 from transformers.pipelines import ImageClassificationPipeline, pipeline
 from transformers.testing_utils import (
-    _run_pipeline_tests,
     compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
@@ -59,11 +58,15 @@ else:
 class ImageClassificationPipelineTests(unittest.TestCase):
     model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
     tf_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
+    _dataset = None
 
-    if _run_pipeline_tests:
-        # we use revision="refs/pr/1" until the PR is merged
-        # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
-        _dataset = datasets.load_dataset(
-            "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
-        )
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls._dataset is None:
+            # we use revision="refs/pr/1" until the PR is merged
+            # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
+            cls._dataset = datasets.load_dataset(
+                "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
+            )
 
@@ -92,6 +95,7 @@ class ImageClassificationPipelineTests(unittest.TestCase):
         return image_classifier, examples
 
     def run_pipeline_test(self, image_classifier, examples):
+        self._load_dataset()
         outputs = image_classifier("./tests/fixtures/tests_samples/COCO/000000039769.png")
         self.assertEqual(

tests/pipelines/test_pipelines_image_segmentation.py

@@ -37,7 +37,6 @@ from transformers import (
     pipeline,
 )
 from transformers.testing_utils import (
-    _run_pipeline_tests,
     compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
@@ -89,11 +88,15 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
         + (MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else [])
         + (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else [])
     )
+    _dataset = None
 
-    if _run_pipeline_tests:
-        # we use revision="refs/pr/1" until the PR is merged
-        # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
-        _dataset = datasets.load_dataset(
-            "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
-        )
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls._dataset is None:
+            # we use revision="refs/pr/1" until the PR is merged
+            # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
+            cls._dataset = datasets.load_dataset(
+                "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
+            )
 
@@ -120,6 +123,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
     ]
 
     def run_pipeline_test(self, image_segmenter, examples):
+        self._load_dataset()
         outputs = image_segmenter(
             "./tests/fixtures/tests_samples/COCO/000000039769.png",
             threshold=0.0,

tests/pipelines/test_pipelines_object_detection.py

@@ -25,8 +25,7 @@ from transformers import (
     is_vision_available,
     pipeline,
 )
 from transformers.testing_utils import (
-    _run_pipeline_tests,
     compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
@@ -57,11 +56,15 @@ else:
 @require_torch
 class ObjectDetectionPipelineTests(unittest.TestCase):
     model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
+    _dataset = None
 
-    if _run_pipeline_tests:
-        # we use revision="refs/pr/1" until the PR is merged
-        # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
-        _dataset = datasets.load_dataset(
-            "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
-        )
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls._dataset is None:
+            # we use revision="refs/pr/1" until the PR is merged
+            # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
+            cls._dataset = datasets.load_dataset(
+                "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
+            )
 
@@ -85,6 +88,7 @@ class ObjectDetectionPipelineTests(unittest.TestCase):
         return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"]
 
     def run_pipeline_test(self, object_detector, examples):
+        self._load_dataset()
         outputs = object_detector("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
         self.assertGreater(len(outputs), 0)

tests/pipelines/test_pipelines_video_classification.py

@@ -19,7 +19,6 @@ from huggingface_hub import VideoClassificationOutputElement, hf_hub_download
 from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEFeatureExtractor
 from transformers.pipelines import VideoClassificationPipeline, pipeline
 from transformers.testing_utils import (
-    _run_pipeline_tests,
     compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
@@ -39,9 +38,13 @@ from .test_pipelines_common import ANY
 @require_av
 class VideoClassificationPipelineTests(unittest.TestCase):
     model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING
+    example_video_filepath = None
 
-    if _run_pipeline_tests:
-        example_video_filepath = hf_hub_download(
-            repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset"
-        )
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls.example_video_filepath is None:
+            cls.example_video_filepath = hf_hub_download(
+                repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset"
+            )
 
@@ -54,6 +57,7 @@ class VideoClassificationPipelineTests(unittest.TestCase):
         processor=None,
         torch_dtype="float32",
     ):
+        self._load_dataset()
         video_classifier = VideoClassificationPipeline(
             model=model,
             tokenizer=tokenizer,