Better pipeline type hints (#38049)

* image-classification
* depth-estimation
* zero-shot-image-classification
* image-feature-extraction
* image-segmentation
* mask-generation
* object-detection
* zero-shot-object-detection
* image-to-image
* image-text-to-text
* image-to-text
* text-classification
* text-generation
* text-to-audio
* text2text_generation
* fixup
* token-classification
* document-qa
* video-classification
* audio-classification
* automatic-speech-recognition
* feature-extraction
* fill-mask
* zero-shot-audio-classification
* Add pipeline function typing
* Add code generator and checker for pipeline types
* Add to makefile
* style
* Add to CI
* Style
Pavel Iakubovskii, 2025-06-13 13:44:07 +01:00, committed by GitHub
commit b3b7789cbc (parent c989ddd294)
27 changed files with 398 additions and 72 deletions

.circleci/config.yml

@@ -184,6 +184,7 @@ jobs:
- run: python utils/check_dummies.py
- run: python utils/check_repo.py
- run: python utils/check_inits.py
- run: python utils/check_pipeline_typing.py
- run: python utils/check_config_docstrings.py
- run: python utils/check_config_attributes.py
- run: python utils/check_doctest_list.py

Makefile

@@ -40,6 +40,7 @@ repo-consistency:
python utils/check_dummies.py
python utils/check_repo.py
python utils/check_inits.py
python utils/check_pipeline_typing.py
python utils/check_config_docstrings.py
python utils/check_config_attributes.py
python utils/check_doctest_list.py
@@ -81,6 +82,7 @@ fix-copies:
python utils/check_copies.py --fix_and_overwrite
python utils/check_modular_conversion.py --fix_and_overwrite
python utils/check_dummies.py --fix_and_overwrite
python utils/check_pipeline_typing.py --fix_and_overwrite
python utils/check_doctest_list.py --fix_and_overwrite
python utils/check_docstrings.py --fix_and_overwrite

src/transformers/pipelines/__init__.py

@@ -564,6 +564,86 @@ def clean_custom_task(task_info):
return task_info, None
# <generated-code>
# fmt: off
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# The part of the file below was automatically generated from the code.
# Do NOT edit this part of the file manually as any edits will be overwritten by the generation
# of the file. If any change should be done, please apply the changes to the `pipeline` function
# below and run `python utils/check_pipeline_typing.py --fix_and_overwrite` to update the file.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Literal, overload
@overload
def pipeline(task: Literal[None], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> Pipeline: ...
@overload
def pipeline(task: Literal["audio-classification"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> AudioClassificationPipeline: ...
@overload
def pipeline(task: Literal["automatic-speech-recognition"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> AutomaticSpeechRecognitionPipeline: ...
@overload
def pipeline(task: Literal["depth-estimation"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> DepthEstimationPipeline: ...
@overload
def pipeline(task: Literal["document-question-answering"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> DocumentQuestionAnsweringPipeline: ...
@overload
def pipeline(task: Literal["feature-extraction"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> FeatureExtractionPipeline: ...
@overload
def pipeline(task: Literal["fill-mask"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> FillMaskPipeline: ...
@overload
def pipeline(task: Literal["image-classification"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ImageClassificationPipeline: ...
@overload
def pipeline(task: Literal["image-feature-extraction"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ImageFeatureExtractionPipeline: ...
@overload
def pipeline(task: Literal["image-segmentation"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ImageSegmentationPipeline: ...
@overload
def pipeline(task: Literal["image-text-to-text"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ImageTextToTextPipeline: ...
@overload
def pipeline(task: Literal["image-to-image"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ImageToImagePipeline: ...
@overload
def pipeline(task: Literal["image-to-text"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ImageToTextPipeline: ...
@overload
def pipeline(task: Literal["mask-generation"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> MaskGenerationPipeline: ...
@overload
def pipeline(task: Literal["object-detection"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ObjectDetectionPipeline: ...
@overload
def pipeline(task: Literal["question-answering"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> QuestionAnsweringPipeline: ...
@overload
def pipeline(task: Literal["summarization"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> SummarizationPipeline: ...
@overload
def pipeline(task: Literal["table-question-answering"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> TableQuestionAnsweringPipeline: ...
@overload
def pipeline(task: Literal["text-classification"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> TextClassificationPipeline: ...
@overload
def pipeline(task: Literal["text-generation"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> TextGenerationPipeline: ...
@overload
def pipeline(task: Literal["text-to-audio"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> TextToAudioPipeline: ...
@overload
def pipeline(task: Literal["text2text-generation"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> Text2TextGenerationPipeline: ...
@overload
def pipeline(task: Literal["token-classification"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> TokenClassificationPipeline: ...
@overload
def pipeline(task: Literal["translation"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> TranslationPipeline: ...
@overload
def pipeline(task: Literal["video-classification"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> VideoClassificationPipeline: ...
@overload
def pipeline(task: Literal["visual-question-answering"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> VisualQuestionAnsweringPipeline: ...
@overload
def pipeline(task: Literal["zero-shot-audio-classification"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ZeroShotAudioClassificationPipeline: ...
@overload
def pipeline(task: Literal["zero-shot-classification"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ZeroShotClassificationPipeline: ...
@overload
def pipeline(task: Literal["zero-shot-image-classification"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ZeroShotImageClassificationPipeline: ...
@overload
def pipeline(task: Literal["zero-shot-object-detection"], model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: Optional[bool] = None, model_kwargs: Optional[Dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ZeroShotObjectDetectionPipeline: ...
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# The part of the file above was automatically generated from the code.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# fmt: on
# </generated-code>
def pipeline(
task: Optional[str] = None,
model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None,
@@ -577,12 +657,12 @@ def pipeline(
use_fast: bool = True,
token: Optional[Union[str, bool]] = None,
device: Optional[Union[int, str, "torch.device"]] = None,
device_map=None,
torch_dtype=None,
device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None,
torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
trust_remote_code: Optional[bool] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
pipeline_class: Optional[Any] = None,
**kwargs,
**kwargs: Any,
) -> Pipeline:
"""
Utility factory method to build a [`Pipeline`].
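With these overloads in place, a static type checker can narrow the return type of `pipeline(...)` from the task literal alone. A minimal sketch of the effect (model resolution still happens at runtime; `reveal_type` is a mypy/pyright directive, not a runtime function):

```python
from transformers import pipeline

clf = pipeline("text-classification")
asr = pipeline("automatic-speech-recognition")

# reveal_type(clf)  # mypy/pyright: TextClassificationPipeline
# reveal_type(asr)  # mypy/pyright: AutomaticSpeechRecognitionPipeline
```

Calls with `task=None` (or no task at all) still fall back to the plain `Pipeline` overload.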

src/transformers/pipelines/audio_classification.py

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
from typing import Union
from typing import Any, Dict, List, Union
import numpy as np
import requests
@@ -27,7 +27,7 @@ if is_torch_available():
logger = logging.get_logger(__name__)
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
"""
Helper function to read an audio file through ffmpeg.
"""
@@ -103,11 +103,7 @@ class AudioClassificationPipeline(Pipeline):
self.check_model_type(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES)
def __call__(
self,
inputs: Union[np.ndarray, bytes, str],
**kwargs,
):
def __call__(self, inputs: Union[np.ndarray, bytes, str, dict], **kwargs: Any) -> List[Dict[str, Any]]:
"""
Classify the sequence(s) given as inputs. See the [`AutomaticSpeechRecognitionPipeline`] documentation for more
information.

src/transformers/pipelines/automatic_speech_recognition.py

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from typing import TYPE_CHECKING, Dict, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
import numpy as np
import requests
@@ -211,11 +211,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
super().__init__(model, tokenizer, feature_extractor, device=device, torch_dtype=torch_dtype, **kwargs)
def __call__(
self,
inputs: Union[np.ndarray, bytes, str],
**kwargs,
):
def __call__(self, inputs: Union[np.ndarray, bytes, str, dict], **kwargs: Any) -> List[Dict[str, Any]]:
"""
Transcribe the audio sequence(s) given as inputs to text. See the [`AutomaticSpeechRecognitionPipeline`]
documentation for more information.

src/transformers/pipelines/depth_estimation.py

@@ -1,4 +1,4 @@
from typing import List, Union
from typing import Any, Dict, List, Union, overload
from ..utils import (
add_end_docstrings,
@@ -52,7 +52,15 @@ class DepthEstimationPipeline(Pipeline):
requires_backends(self, "vision")
self.check_model_type(MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES)
def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
@overload
def __call__(self, inputs: Union[str, "Image.Image"], **kwargs: Any) -> Dict[str, Any]: ...
@overload
def __call__(self, inputs: List[Union[str, "Image.Image"]], **kwargs: Any) -> List[Dict[str, Any]]: ...
def __call__(
self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs: Any
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
"""
Predict the depth(s) of the image(s) passed as inputs.
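This single-input/list-input `@overload` pair is the pattern repeated across most pipelines in this commit. Reduced to a self-contained sketch (illustrative names, not library code):

```python
from typing import Any, Dict, List, Union, overload

@overload
def call(inputs: str, **kwargs: Any) -> Dict[str, Any]: ...
@overload
def call(inputs: List[str], **kwargs: Any) -> List[Dict[str, Any]]: ...
def call(inputs: Union[str, List[str]], **kwargs: Any) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
    # Only the overload signatures are visible to the type checker;
    # one implementation serves both input shapes.
    if isinstance(inputs, list):
        return [{"input": item} for item in inputs]
    return {"input": inputs}
```

A checker then infers `Dict[str, Any]` for `call("image.png")` and `List[Dict[str, Any]]` for `call(["a.png", "b.png"])`.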

src/transformers/pipelines/document_question_answering.py

@@ -13,7 +13,7 @@
# limitations under the License.
import re
from typing import List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union, overload
import numpy as np
@@ -209,13 +209,28 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
return preprocess_params, forward_params, postprocess_params
@overload
def __call__(
self,
image: Union["Image.Image", str],
question: str,
word_boxes: Optional[Tuple[str, List[float]]] = None,
**kwargs: Any,
) -> List[Dict[str, Any]]: ...
@overload
def __call__(self, image: Dict[str, Any], **kwargs: Any) -> List[Dict[str, Any]]: ...
@overload
def __call__(self, image: List[Dict[str, Any]], **kwargs: Any) -> List[List[Dict[str, Any]]]: ...
def __call__(
self,
image: Union["Image.Image", str, List[Dict[str, Any]]],
question: Optional[str] = None,
word_boxes: Optional[Tuple[str, List[float]]] = None,
**kwargs,
):
**kwargs: Any,
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
"""
Answer the question(s) given as inputs by using the document(s). A document is defined as an image and an
optional list of (word, box) tuples which represent the text in the document. If the `word_boxes` are not

src/transformers/pipelines/feature_extraction.py

@@ -1,4 +1,4 @@
from typing import Dict
from typing import Any, Dict, List, Union
from ..utils import add_end_docstrings
from .base import GenericTensor, Pipeline, build_pipeline_init_args
@@ -73,9 +73,9 @@ class FeatureExtractionPipeline(Pipeline):
elif self.framework == "tf":
return model_outputs[0].numpy().tolist()
def __call__(self, *args, **kwargs):
def __call__(self, *args: Union[str, List[str]], **kwargs: Any) -> Union[Any, List[Any]]:
"""
Extract the features of the input(s).
Extract the features of the input(s) text.
Args:
args (`str` or `List[str]`): One or several texts (or one list of texts) to get the features of.

src/transformers/pipelines/fill_mask.py

@@ -1,4 +1,4 @@
from typing import Dict
from typing import Any, Dict, List, Union, overload
import numpy as np
@@ -245,7 +245,15 @@ class FillMaskPipeline(Pipeline):
)
return preprocess_params, {}, postprocess_params
def __call__(self, inputs, **kwargs):
@overload
def __call__(self, inputs: str, **kwargs: Any) -> List[Dict[str, Any]]: ...
@overload
def __call__(self, inputs: List[str], **kwargs: Any) -> List[List[Dict[str, Any]]]: ...
def __call__(
self, inputs: Union[str, List[str]], **kwargs: Any
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
"""
Fill the masked token in the text(s) given as inputs.
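For fill-mask the two shapes are nested one level deeper: a single string yields a list of candidate dicts, while a list of strings yields a list of such lists. Illustrative usage (the mask token is model-dependent, so `[MASK]` here is an assumption):

```python
from transformers import pipeline

unmasker = pipeline("fill-mask")

single = unmasker("Paris is the [MASK] of France.")
# inferred: List[Dict[str, Any]] -- one dict per candidate token

batch = unmasker(["Paris is the [MASK] of France.", "The sky is [MASK]."])
# inferred: List[List[Dict[str, Any]]] -- one inner list per input
```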

src/transformers/pipelines/image_classification.py

@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union
from typing import Any, Dict, List, Union, overload
import numpy as np
@@ -122,7 +122,15 @@ class ImageClassificationPipeline(Pipeline):
postprocess_params["function_to_apply"] = function_to_apply
return preprocess_params, {}, postprocess_params
def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
@overload
def __call__(self, inputs: Union[str, "Image.Image"], **kwargs: Any) -> List[Dict[str, Any]]: ...
@overload
def __call__(self, inputs: Union[List[str], List["Image.Image"]], **kwargs: Any) -> List[List[Dict[str, Any]]]: ...
def __call__(
self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs: Any
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
"""
Assign labels to the image(s) passed as inputs.

src/transformers/pipelines/image_feature_extraction.py

@@ -1,10 +1,12 @@
from typing import Dict
from typing import Any, Dict, List, Union
from ..utils import add_end_docstrings, is_vision_available
from .base import GenericTensor, Pipeline, build_pipeline_init_args
if is_vision_available():
from PIL import Image
from ..image_utils import load_image
@@ -88,7 +90,7 @@ class ImageFeatureExtractionPipeline(Pipeline):
elif self.framework == "tf":
return outputs.numpy().tolist()
def __call__(self, *args, **kwargs):
def __call__(self, *args: Union[str, "Image.Image", List["Image.Image"], List[str]], **kwargs: Any) -> List[Any]:
"""
Extract the features of the input(s).

src/transformers/pipelines/image_segmentation.py

@@ -1,4 +1,4 @@
from typing import Any, Dict, List, Union
from typing import Any, Dict, List, Union, overload
import numpy as np
@@ -23,10 +23,6 @@ if is_torch_available():
logger = logging.get_logger(__name__)
Prediction = Dict[str, Any]
Predictions = List[Prediction]
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class ImageSegmentationPipeline(Pipeline):
"""
@@ -94,7 +90,15 @@ class ImageSegmentationPipeline(Pipeline):
return preprocess_kwargs, {}, postprocess_kwargs
def __call__(self, inputs=None, **kwargs) -> Union[Predictions, List[Prediction]]:
@overload
def __call__(self, inputs: Union[str, "Image.Image"], **kwargs: Any) -> List[Dict[str, Any]]: ...
@overload
def __call__(self, inputs: Union[List[str], List["Image.Image"]], **kwargs: Any) -> List[List[Dict[str, Any]]]: ...
def __call__(
self, inputs: Union[str, "Image.Image", List[str], List["Image.Image"]], **kwargs: Any
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
"""
Perform segmentation (detect masks & classes) in the image(s) passed as inputs.
@@ -123,9 +127,8 @@ class ImageSegmentationPipeline(Pipeline):
the call may block forever.
Return:
A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
list of dictionaries, if the input is a list of several images, will return a list of list of dictionaries
corresponding to each image.
If the input is a single image, will return a list of dictionaries, if the input is a list of several images,
will return a list of list of dictionaries corresponding to each image.
The dictionaries contain the mask, label and score (where applicable) of each detected object and contains
the following keys:

src/transformers/pipelines/image_text_to_text.py

@@ -15,7 +15,7 @@
import enum
from collections.abc import Iterable # pylint: disable=g-importing-member
from typing import Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union, overload
from ..generation import GenerationConfig
from ..processing_utils import ProcessingKwargs, Unpack
@@ -251,6 +251,22 @@ class ImageTextToTextPipeline(Pipeline):
generate_kwargs["eos_token_id"] = stop_sequence_ids[0]
return preprocess_params, forward_kwargs, postprocess_params
@overload
def __call__(
self,
image: Optional[Union[str, "Image.Image"]] = None,
text: Optional[str] = None,
**kwargs: Any,
) -> List[Dict[str, Any]]: ...
@overload
def __call__(
self,
image: Optional[Union[List[str], List["Image.Image"]]] = None,
text: Optional[List[str]] = None,
**kwargs: Any,
) -> List[List[Dict[str, Any]]]: ...
def __call__(
self,
images: Optional[
@@ -266,7 +282,7 @@ class ImageTextToTextPipeline(Pipeline):
] = None,
text: Optional[Union[str, List[str], List[dict]]] = None,
**kwargs,
):
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
"""
Generate a text given text and the image(s) passed as inputs.

src/transformers/pipelines/image_to_image.py

@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union
from typing import Any, List, Union, overload
import numpy as np
@@ -84,8 +84,14 @@ class ImageToImagePipeline(Pipeline):
return preprocess_params, forward_params, postprocess_params
@overload
def __call__(self, images: Union[str, "Image.Image"], **kwargs: Any) -> "Image.Image": ...
@overload
def __call__(self, images: Union[List[str], List["Image.Image"]], **kwargs: Any) -> List["Image.Image"]: ...
def __call__(
self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs
self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs: Any
) -> Union["Image.Image", List["Image.Image"]]:
"""
Transform the image(s) passed as inputs.

src/transformers/pipelines/image_to_text.py

@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union
from typing import Any, Dict, List, Union, overload
from ..generation import GenerationConfig
from ..utils import (
@@ -111,7 +111,13 @@ class ImageToTextPipeline(Pipeline):
return preprocess_params, forward_params, {}
def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
@overload
def __call__(self, inputs: Union[str, "Image.Image"], **kwargs: Any) -> List[Dict[str, Any]]: ...
@overload
def __call__(self, inputs: Union[List[str], List["Image.Image"]], **kwargs: Any) -> List[List[Dict[str, Any]]]: ...
def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
"""
Assign labels to the image(s) passed as inputs.

src/transformers/pipelines/mask_generation.py

@@ -1,5 +1,5 @@
from collections import defaultdict
from typing import Optional
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, overload
from ..image_utils import load_image
from ..utils import (
@@ -16,6 +16,9 @@ if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_MASK_GENERATION_MAPPING_NAMES
if TYPE_CHECKING:
from PIL import Image
logger = logging.get_logger(__name__)
@@ -125,12 +128,22 @@ class MaskGenerationPipeline(ChunkPipeline):
postprocess_kwargs["output_bboxes_mask"] = kwargs["output_bboxes_mask"]
return preprocess_kwargs, forward_params, postprocess_kwargs
def __call__(self, image, *args, num_workers=None, batch_size=None, **kwargs):
@overload
def __call__(self, image: Union[str, "Image.Image"], *args: Any, **kwargs: Any) -> Dict[str, Any]: ...
@overload
def __call__(
self, image: Union[List[str], List["Image.Image"]], *args: Any, **kwargs: Any
) -> List[Dict[str, Any]]: ...
def __call__(
self, image: Union[str, "Image.Image", List[str], List["Image.Image"]], *args: Any, **kwargs: Any
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
"""
Generates binary segmentation masks
Args:
inputs (`np.ndarray` or `bytes` or `str` or `dict`):
image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
Image or list of images.
mask_threshold (`float`, *optional*, defaults to 0.0):
Threshold to use when turning the predicted masks into binary values.
@@ -163,6 +176,8 @@ class MaskGenerationPipeline(ChunkPipeline):
the "object" described by the label and the mask.
"""
num_workers = kwargs.pop("num_workers", None)
batch_size = kwargs.pop("batch_size", None)
return super().__call__(image, *args, num_workers=num_workers, batch_size=batch_size, **kwargs)
def preprocess(

src/transformers/pipelines/object_detection.py

@@ -1,4 +1,4 @@
from typing import Any, Dict, List, Union
from typing import TYPE_CHECKING, Any, Dict, List, Union, overload
from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
from .base import Pipeline, build_pipeline_init_args
@@ -16,13 +16,12 @@ if is_torch_available():
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
)
if TYPE_CHECKING:
from PIL import Image
logger = logging.get_logger(__name__)
Prediction = Dict[str, Any]
Predictions = List[Prediction]
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class ObjectDetectionPipeline(Pipeline):
"""
@@ -69,7 +68,15 @@ class ObjectDetectionPipeline(Pipeline):
postprocess_kwargs["threshold"] = kwargs["threshold"]
return preprocess_params, {}, postprocess_kwargs
def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]:
@overload
def __call__(self, image: Union[str, "Image.Image"], *args: Any, **kwargs: Any) -> List[Dict[str, Any]]: ...
@overload
def __call__(
self, image: Union[List[str], List["Image.Image"]], *args: Any, **kwargs: Any
) -> List[List[Dict[str, Any]]]: ...
def __call__(self, *args, **kwargs) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
"""
Detect objects (bounding boxes & classes) in the image(s) passed as inputs.

src/transformers/pipelines/text2text_generation.py

@@ -1,5 +1,6 @@
import enum
import warnings
from typing import Any, Dict, List, Union
from ..generation import GenerationConfig
from ..tokenization_utils import TruncationStrategy
@@ -154,7 +155,7 @@ class Text2TextGenerationPipeline(Pipeline):
del inputs["token_type_ids"]
return inputs
def __call__(self, *args, **kwargs):
def __call__(self, *args: Union[str, List[str]], **kwargs: Any) -> List[Dict[str, str]]:
r"""
Generate the output text(s) using text(s) given as inputs.

src/transformers/pipelines/text_classification.py

@@ -1,6 +1,6 @@
import inspect
import warnings
from typing import Dict
from typing import Any, Dict, List, Union
import numpy as np
@@ -120,7 +120,11 @@ class TextClassificationPipeline(Pipeline):
postprocess_params["function_to_apply"] = function_to_apply
return preprocess_params, {}, postprocess_params
def __call__(self, inputs, **kwargs):
def __call__(
self,
inputs: Union[str, List[str], Dict[str, str], List[Dict[str, str]]],
**kwargs: Any,
) -> List[Dict[str, Any]]:
"""
Classify the text(s) given as inputs.
@@ -148,7 +152,7 @@ class TextClassificationPipeline(Pipeline):
- `"none"`: Does not apply any function on the output.
Return:
A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:
A list of `dict`: Each result comes as list of dictionaries with the following keys:
- **label** (`str`) -- The label predicted.
- **score** (`float`) -- The corresponding probability.

src/transformers/pipelines/text_generation.py

@@ -1,7 +1,7 @@
import enum
import itertools
import types
from typing import Dict
from typing import Any, Dict, List, overload
from ..generation import GenerationConfig
from ..utils import ModelOutput, add_end_docstrings, is_tf_available, is_torch_available
@@ -19,6 +19,8 @@ if is_tf_available():
from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
ChatType = List[Dict[str, str]]
class ReturnType(enum.Enum):
TENSORS = 0
@@ -231,6 +233,18 @@ class TextGenerationPipeline(Pipeline):
return super()._parse_and_tokenize(*args, **kwargs)
@overload
def __call__(self, text_inputs: str, **kwargs: Any) -> List[Dict[str, str]]: ...
@overload
def __call__(self, text_inputs: List[str], **kwargs: Any) -> List[List[Dict[str, str]]]: ...
@overload
def __call__(self, text_inputs: ChatType, **kwargs: Any) -> List[Dict[str, ChatType]]: ...
@overload
def __call__(self, text_inputs: List[ChatType], **kwargs: Any) -> List[List[Dict[str, ChatType]]]: ...
def __call__(self, text_inputs, **kwargs):
"""
Complete the prompt(s) given as inputs.
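The new `ChatType` alias (`List[Dict[str, str]]`) lets the overloads distinguish a chat from a batch of plain prompts, so chat-style calls get a chat-shaped return type. A hedged sketch (assumes a model with a chat template at runtime):

```python
from transformers import pipeline

generator = pipeline("text-generation")

text_out = generator("Once upon a time")
# inferred: List[Dict[str, str]]

chat = [{"role": "user", "content": "Tell me a joke."}]
chat_out = generator(chat)
# inferred: List[Dict[str, ChatType]] -- generated_text holds the extended chat
```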

src/transformers/pipelines/text_to_audio.py

@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.from typing import List, Union
from typing import List, Union
from typing import Any, Dict, List, Union, overload
from ..generation import GenerationConfig
from ..utils import is_torch_available
@@ -173,7 +173,15 @@ class TextToAudioPipeline(Pipeline):
return output
def __call__(self, text_inputs: Union[str, List[str]], **forward_params):
@overload
def __call__(self, text_inputs: str, **forward_params: Any) -> Dict[str, Any]: ...
@overload
def __call__(self, text_inputs: List[str], **forward_params: Any) -> List[Dict[str, Any]]: ...
def __call__(
self, text_inputs: Union[str, List[str]], **forward_params
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
"""
Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

src/transformers/pipelines/token_classification.py

@@ -1,6 +1,6 @@
import types
import warnings
from typing import List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union, overload
import numpy as np
@@ -217,7 +217,15 @@ class TokenClassificationPipeline(ChunkPipeline):
)
return preprocess_params, {}, postprocess_params
def __call__(self, inputs: Union[str, List[str]], **kwargs):
@overload
def __call__(self, inputs: str, **kwargs: Any) -> List[Dict[str, str]]: ...
@overload
def __call__(self, inputs: List[str], **kwargs: Any) -> List[List[Dict[str, str]]]: ...
def __call__(
self, inputs: Union[str, List[str]], **kwargs: Any
) -> Union[List[Dict[str, str]], List[List[Dict[str, str]]]]:
"""
Classify each token of the text(s) given as inputs.

src/transformers/pipelines/video_classification.py

@@ -13,7 +13,7 @@
# limitations under the License.
import warnings
from io import BytesIO
from typing import List, Optional, Union
from typing import Any, Dict, List, Optional, Union, overload
import requests
@@ -77,6 +77,12 @@ class VideoClassificationPipeline(Pipeline):
postprocess_params["function_to_apply"] = "softmax"
return preprocess_params, {}, postprocess_params
@overload
def __call__(self, inputs: str, **kwargs: Any) -> List[Dict[str, Any]]: ...
@overload
def __call__(self, inputs: List[str], **kwargs: Any) -> List[List[Dict[str, Any]]]: ...
def __call__(self, inputs: Optional[Union[str, List[str]]] = None, **kwargs):
"""
Assign labels to the video(s) passed as inputs.

src/transformers/pipelines/zero_shot_audio_classification.py

@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import UserDict
from typing import Union
from typing import Any, Dict, List, Union
import numpy as np
import requests
@@ -67,7 +67,7 @@ class ZeroShotAudioClassificationPipeline(Pipeline):
raise ValueError(f"The {self.__class__} is only available in PyTorch.")
# No specific FOR_XXX available yet
def __call__(self, audios: Union[np.ndarray, bytes, str], **kwargs):
def __call__(self, audios: Union[np.ndarray, bytes, str, dict], **kwargs: Any) -> List[Dict[str, Any]]:
"""
Assign labels to the audio(s) passed as inputs.

src/transformers/pipelines/zero_shot_image_classification.py

@@ -1,6 +1,6 @@
import warnings
from collections import UserDict
from typing import List, Union
from typing import Any, Dict, List, Union, overload
from ..utils import (
add_end_docstrings,
@@ -74,7 +74,22 @@ class ZeroShotImageClassificationPipeline(Pipeline):
else MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
)
def __call__(self, image: Union[str, List[str], "Image", List["Image"]] = None, **kwargs):
@overload
def __call__(
self, image: Union[str, "Image.Image"], candidate_labels: List[str], **kwargs: Any
) -> List[Dict[str, Any]]: ...
@overload
def __call__(
self, image: Union[List[str], List["Image.Image"]], candidate_labels: List[str], **kwargs: Any
) -> List[List[Dict[str, Any]]]: ...
def __call__(
self,
image: Union[str, List[str], "Image.Image", List["Image.Image"]],
candidate_labels: List[str],
**kwargs: Any,
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
"""
Assign labels to the image(s) passed as inputs.
@@ -110,7 +125,7 @@ class ZeroShotImageClassificationPipeline(Pipeline):
image = kwargs.pop("images")
if image is None:
raise ValueError("Cannot call the zero-shot-image-classification pipeline without an images argument!")
return super().__call__(image, **kwargs)
return super().__call__(image, candidate_labels=candidate_labels, **kwargs)
def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs):
preprocess_params = {}
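Note that `candidate_labels` is now an explicit, typed parameter forwarded to `super().__call__` instead of riding along in `**kwargs`. An illustrative typed call (image path and labels are examples):

```python
from transformers import pipeline

classifier = pipeline("zero-shot-image-classification")
preds = classifier("cat.png", candidate_labels=["cat", "dog", "car"])
# inferred: List[Dict[str, Any]] -- one dict per candidate label
```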

src/transformers/pipelines/zero_shot_object_detection.py

@@ -1,4 +1,4 @@
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union, overload
from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
from .base import ChunkPipeline, build_pipeline_init_args
@@ -62,12 +62,20 @@ class ZeroShotObjectDetectionPipeline(ChunkPipeline):
requires_backends(self, "vision")
self.check_model_type(MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES)
@overload
def __call__(
self, image: Union[str, "Image.Image"], candidate_labels: Union[str, List[str]], **kwargs: Any
) -> List[Dict[str, Any]]: ...
@overload
def __call__(self, image: List[Dict[str, Any]], **kwargs: Any) -> List[List[Dict[str, Any]]]: ...
def __call__(
self,
image: Union[str, "Image.Image", List[Dict[str, Any]]],
candidate_labels: Optional[Union[str, List[str]]] = None,
**kwargs,
):
**kwargs: Any,
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
"""
Detect objects (bounding boxes & classes) in the image(s) passed as inputs.

utils/check_pipeline_typing.py (new file)

@@ -0,0 +1,93 @@
import re
from transformers.pipelines import SUPPORTED_TASKS, Pipeline
HEADER = """
# fmt: off
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# The part of the file below was automatically generated from the code.
# Do NOT edit this part of the file manually as any edits will be overwritten by the generation
# of the file. If any change should be done, please apply the changes to the `pipeline` function
# below and run `python utils/check_pipeline_typing.py --fix_and_overwrite` to update the file.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Literal, overload
"""
FOOTER = """
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# The part of the file above was automatically generated from the code.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# fmt: on
"""
TASK_PATTERN = "task: Optional[str] = None"
def main(pipeline_file_path: str, fix_and_overwrite: bool = False):
with open(pipeline_file_path, "r") as file:
content = file.read()
# extract generated code in between <generated-code> and </generated-code>
current_generated_code = re.search(r"# <generated-code>(.*)# </generated-code>", content, re.DOTALL).group(1)
content_without_generated_code = content.replace(current_generated_code, "")
# extract pipeline signature in between `def pipeline` and `-> Pipeline`
pipeline_signature = re.search(r"def pipeline(.*) -> Pipeline:", content_without_generated_code, re.DOTALL).group(
1
)
pipeline_signature = pipeline_signature.replace("(\n ", "(") # start of the signature
pipeline_signature = pipeline_signature.replace(",\n ", ", ") # intermediate arguments
pipeline_signature = pipeline_signature.replace(",\n)", ")") # end of the signature
# collect and sort available pipelines
pipelines = [(f'"{task}"', task_info["impl"]) for task, task_info in SUPPORTED_TASKS.items()]
pipelines = sorted(pipelines, key=lambda x: x[0])
pipelines.insert(0, (None, Pipeline))
# generate new `pipeline` signatures
new_generated_code = ""
for task, pipeline_class in pipelines:
if TASK_PATTERN not in pipeline_signature:
raise ValueError(f"Can't find `{TASK_PATTERN}` in pipeline signature: {pipeline_signature}")
pipeline_type = pipeline_class if isinstance(pipeline_class, str) else pipeline_class.__name__
new_pipeline_signature = pipeline_signature.replace(TASK_PATTERN, f"task: Literal[{task}]")
new_generated_code += f"@overload\ndef pipeline{new_pipeline_signature} -> {pipeline_type}: ...\n"
new_generated_code = HEADER + new_generated_code + FOOTER
new_generated_code = new_generated_code.rstrip("\n") + "\n"
if new_generated_code != current_generated_code and fix_and_overwrite:
print(f"Updating {pipeline_file_path}...")
wrapped_current_generated_code = "# <generated-code>" + current_generated_code + "# </generated-code>"
wrapped_new_generated_code = "# <generated-code>" + new_generated_code + "# </generated-code>"
content = content.replace(wrapped_current_generated_code, wrapped_new_generated_code)
# write content to file
with open(pipeline_file_path, "w") as file:
file.write(content)
elif new_generated_code != current_generated_code and not fix_and_overwrite:
message = (
f"Found inconsistencies in {pipeline_file_path}. "
"Run `python utils/check_pipeline_typing.py --fix_and_overwrite` to fix them."
)
raise ValueError(message)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
parser.add_argument(
"--pipeline_file_path",
type=str,
default="src/transformers/pipelines/__init__.py",
help="Path to the pipeline file.",
)
args = parser.parse_args()
main(args.pipeline_file_path, args.fix_and_overwrite)
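The heart of the generator is a single string substitution on the real `pipeline` signature. Reduced to one task as a standalone sketch (the actual script extracts the signature from the pipeline file and iterates over `SUPPORTED_TASKS`; the shortened signature below is illustrative):

```python
# Shortened stand-in for the signature extracted from `def pipeline(...) -> Pipeline:`
signature = "(task: Optional[str] = None, model=None, **kwargs: Any)"
TASK_PATTERN = "task: Optional[str] = None"

new_signature = signature.replace(TASK_PATTERN, 'task: Literal["fill-mask"]')
print(f"@overload\ndef pipeline{new_signature} -> FillMaskPipeline: ...")
# -> @overload
# -> def pipeline(task: Literal["fill-mask"], model=None, **kwargs: Any) -> FillMaskPipeline: ...
```

In check mode (no `--fix_and_overwrite`) the script raises a ValueError when the block between `# <generated-code>` and `# </generated-code>` is stale; with the flag it rewrites that block in place.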