Update modeling doc strings FE -> IP (#21106)

* Update docs examples FE -> IP

* Remove _IMAGE_PROCESSOR_FOR_DOC
amyeroberts 2023-01-20 11:18:10 +00:00 committed by GitHub
parent 5d3cb760a0
commit 91c2278b97
50 changed files with 59 additions and 161 deletions
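
As a quick illustration of the rename (a minimal sketch; the `openai/clip-vit-base-patch32` checkpoint is only an example and is not part of this diff), the `*ImageProcessor` classes are drop-in replacements for the deprecated `*FeatureExtractor` classes when preparing `pixel_values`:

```python
from PIL import Image
import requests

from transformers import CLIPImageProcessor, CLIPModel

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Previously CLIPFeatureExtractor.from_pretrained(...); the call pattern is unchanged.
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Returns a BatchFeature whose `pixel_values` feed the vision tower.
inputs = image_processor(images=image, return_tensors="pt")
image_features = model.get_image_features(pixel_values=inputs.pixel_values)
```

The docstring updates below simply point readers at these image processor classes instead of the feature extractor aliases.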

View File

@ -98,7 +98,7 @@ ALTCLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@ -133,7 +133,7 @@ ALTCLIP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):

View File

@ -49,7 +49,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "BeitConfig"
_FEAT_EXTRACTOR_FOR_DOC = "BeitImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/beit-base-patch16-224-pt22k"
@ -646,7 +645,6 @@ class BeitModel(BeitPreTrainedModel):
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BeitModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@ -844,7 +842,6 @@ class BeitForImageClassification(BeitPreTrainedModel):
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -46,7 +46,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "BitConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
# Base docstring
_CHECKPOINT_FOR_DOC = "google/bit-50"
@ -688,8 +687,8 @@ BIT_START_DOCSTRING = r"""
BIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
[`AutoFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
@ -723,7 +722,6 @@ class BitModel(BitPreTrainedModel):
@add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -782,7 +780,6 @@ class BitForImageClassification(BitPreTrainedModel):
@add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -810,7 +810,7 @@ CHINESE_CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`ChineseCLIPFeatureExtractor`]. See [`ChineseCLIPFeatureExtractor.__call__`] for details.
[`ChineseCLIPImageProcessor`]. See [`ChineseCLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@ -853,7 +853,7 @@ CHINESE_CLIP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`ChineseCLIPFeatureExtractor`]. See [`ChineseCLIPFeatureExtractor.__call__`] for details.
[`ChineseCLIPImageProcessor`]. See [`ChineseCLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):

View File

@ -521,7 +521,7 @@ CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@ -556,7 +556,7 @@ CLIP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):

View File

@ -108,7 +108,7 @@ CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@ -143,7 +143,7 @@ CLIP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@ -893,7 +893,7 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
Args:
pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained
using [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
using [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
Returns:
image_features (`jnp.ndarray` of shape `(batch_size, output_dim`): The image embeddings obtained by

View File

@ -993,8 +993,8 @@ CLIP_TEXT_INPUTS_DOCSTRING = r"""
CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to
Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to
return the attentions tensors of all attention layers. See `attentions` under returned tensors for more
detail. This argument can be used only in eager mode, in graph mode the value in the config will be used
instead.
@ -1020,8 +1020,8 @@ CLIP_INPUTS_DOCSTRING = r"""
[What are input IDs?](../glossary#input-ids)
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

View File

@ -530,7 +530,7 @@ CLIPSEG_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@ -565,7 +565,7 @@ CLIPSEG_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):

View File

@ -44,7 +44,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ConvNextConfig"
_FEAT_EXTRACTOR_FOR_DOC = "ConvNextImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"
@ -346,7 +345,6 @@ class ConvNextModel(ConvNextPreTrainedModel):
@add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -414,7 +412,6 @@ class ConvNextForImageClassification(ConvNextPreTrainedModel):
@add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -35,7 +35,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "CvtConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/cvt-13"
@ -605,7 +604,6 @@ class CvtModel(CvtPreTrainedModel):
@add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithCLSToken,
config_class=_CONFIG_FOR_DOC,
@ -668,7 +666,6 @@ class CvtForImageClassification(CvtPreTrainedModel):
@add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -48,7 +48,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "Data2VecVisionConfig"
_FEAT_EXTRACTOR_FOR_DOC = "BeitImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
@ -660,7 +659,6 @@ class Data2VecVisionModel(Data2VecVisionPreTrainedModel):
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Data2VecVisionModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@ -760,7 +758,6 @@ class Data2VecVisionForImageClassification(Data2VecVisionPreTrainedModel):
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -53,7 +53,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "Data2VecVisionConfig"
_FEAT_EXTRACTOR_FOR_DOC = "BeitImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
@ -894,7 +893,6 @@ class TFData2VecVisionModel(TFData2VecVisionPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFData2VecVisionModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@ -960,7 +958,6 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF
@unpack_inputs
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -44,7 +44,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "DeiTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "DeiTImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
@ -483,7 +482,6 @@ class DeiTModel(DeiTPreTrainedModel):
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@ -854,7 +852,6 @@ class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=DeiTForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -52,7 +52,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "DeiTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "DeiTImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
@ -651,7 +650,6 @@ class TFDeiTModel(TFDeiTPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@ -1009,7 +1007,6 @@ class TFDeiTForImageClassificationWithTeacher(TFDeiTPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFDeiTForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -57,7 +57,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "DinatConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "shi-labs/dinat-mini-in1k-224"
@ -730,7 +729,6 @@ class DinatModel(DinatPreTrainedModel):
@add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=DinatModelOutput,
config_class=_CONFIG_FOR_DOC,
@ -810,7 +808,6 @@ class DinatForImageClassification(DinatPreTrainedModel):
@add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=DinatImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "DonutSwinConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
# Base docstring
_CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base"
@ -847,8 +846,8 @@ SWIN_START_DOCSTRING = r"""
SWIN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
[`AutoFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@ -898,7 +897,6 @@ class DonutSwinModel(DonutSwinPreTrainedModel):
@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=DonutSwinModelOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -49,7 +49,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "DPTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "DPTImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "Intel/dpt-large"
@ -898,7 +897,6 @@ class DPTModel(DPTPreTrainedModel):
@add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndIntermediateActivations,
config_class=_CONFIG_FOR_DOC,

View File

@ -51,7 +51,6 @@ _CHECKPOINT_FOR_DOC = "facebook/flava-full"
# Codebook docstring
_CHECKPOINT_FOR_CODEBOOK_DOC = "facebook/flava-image-codebook"
_FEAT_EXTRACTOR_FOR_DOC = "FlavaFeatureExtractor"
_CONFIG_CLASS_FOR_IMAGE_MODEL_DOC = "FlavaImageConfig"
_CONFIG_CLASS_FOR_TEXT_MODEL_DOC = "FlavaTextConfig"
_CONFIG_CLASS_FOR_MULTIMODAL_MODEL_DOC = "FlavaMultimodalConfig"
@ -750,8 +749,8 @@ FLAVA_INPUTS_DOCSTRING_COMMON = r"""
FLAVA_IMAGE_INPUTS_DOCSTRING_BASE = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`FlavaFeatureExtractor`]. See
[`FlavaFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`FlavaImageProcessor`]. See
[`FlavaImageProcessor.__call__`] for details.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
@ -926,7 +925,6 @@ class FlavaImageModel(FlavaPreTrainedModel):
@add_start_docstrings_to_model_forward(FLAVA_IMAGE_INPUTS_DOCSTRING.format("batch_size, image_num_patches"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_CLASS_FOR_IMAGE_MODEL_DOC,
@ -1568,22 +1566,22 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Codebook pixel values can be obtained using [`FlavaFeatureExtractor`] by passing
`return_codebook_pixels=True`. See [`FlavaFeatureExtractor.__call__`] for details.
Pixel values. Codebook pixel values can be obtained using [`FlavaImageProcessor`] by passing
`return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import FlavaFeatureExtractor, FlavaImageCodebook
>>> from transformers import FlavaImageProcessor, FlavaImageCodebook
>>> model = FlavaImageCodebook.from_pretrained("{0}")
>>> feature_extractor = FlavaFeatureExtractor.from_pretrained("{0}")
>>> image_processor = FlavaImageProcessor.from_pretrained("{0}")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = feature_extractor([image], return_codebook_pixels=True, return_tensors="pt")
>>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
>>> inputs = dict(pixel_values=inputs.codebook_pixel_values)
>>> outputs = model.get_codebook_indices(**inputs)
@ -1602,23 +1600,23 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Codebook pixel values can be obtained using [`FlavaFeatureExtractor`] by passing
`return_codebook_pixels=True`. See [`FlavaFeatureExtractor.__call__`] for details.
Pixel values. Codebook pixel values can be obtained using [`FlavaImageProcessor`] by passing
`return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import FlavaFeatureExtractor, FlavaImageCodebook
>>> from transformers import FlavaImageProcessor, FlavaImageCodebook
>>> model = FlavaImageCodebook.from_pretrained("{0}")
>>> feature_extractor = FlavaFeatureExtractor.from_pretrained("{0}")
>>> image_processor = FlavaImageProcessor.from_pretrained("{0}")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = feature_extractor([image], return_codebook_pixels=True, return_tensors="pt")
>>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
>>> inputs = dict(pixel_values=inputs.codebook_pixel_values)
>>> outputs = model(**inputs)

View File

@ -916,7 +916,7 @@ GIT_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.

View File

@ -41,7 +41,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "GLPNConfig"
_FEAT_EXTRACTOR_FOR_DOC = "GLPNImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "vinvino02/glpn-kitti"
@ -503,7 +502,6 @@ class GLPNModel(GLPNPreTrainedModel):
@add_start_docstrings_to_model_forward(GLPN_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -857,7 +857,7 @@ GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@ -891,8 +891,8 @@ GROUPVIT_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):

View File

@ -1555,8 +1555,8 @@ GROUPVIT_TEXT_INPUTS_DOCSTRING = r"""
GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
@ -1583,8 +1583,8 @@ GROUPVIT_INPUTS_DOCSTRING = r"""
[What are input IDs?](../glossary#input-ids)
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

View File

@ -38,7 +38,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "LevitConfig"
_FEAT_EXTRACTOR_FOR_DOC = "LevitImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/levit-128S"
@ -549,7 +548,6 @@ class LevitModel(LevitPreTrainedModel):
@add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -618,7 +616,6 @@ class LevitForImageClassification(LevitPreTrainedModel):
@add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -711,7 +708,6 @@ class LevitForImageClassificationWithTeacher(LevitPreTrainedModel):
@add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=LevitForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -51,7 +51,6 @@ logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "MaskFormerConfig"
_CHECKPOINT_FOR_DOC = "facebook/maskformer-swin-base-ade"
_FEAT_EXTRACTOR_FOR_DOC = "MaskFormerImageProcessor"
MASKFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/maskformer-swin-base-ade",

View File

@ -33,7 +33,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "MobileNetV1Config"
_FEAT_EXTRACTOR_FOR_DOC = "MobileNetV1ImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "google/mobilenet_v1_1.0_224"
@ -355,7 +354,6 @@ class MobileNetV1Model(MobileNetV1PreTrainedModel):
@add_start_docstrings_to_model_forward(MOBILENET_V1_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -428,7 +426,6 @@ class MobileNetV1ForImageClassification(MobileNetV1PreTrainedModel):
@add_start_docstrings_to_model_forward(MOBILENET_V1_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "MobileNetV2Config"
_FEAT_EXTRACTOR_FOR_DOC = "MobileNetV2ImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "google/mobilenet_v2_1.0_224"
@ -566,7 +565,6 @@ class MobileNetV2Model(MobileNetV2PreTrainedModel):
@add_start_docstrings_to_model_forward(MOBILENET_V2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -639,8 +637,6 @@ class MobileNetV2ForImageClassification(MobileNetV2PreTrainedModel):
@add_start_docstrings_to_model_forward(MOBILENET_V2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,

View File

@ -49,7 +49,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "MobileViTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "MobileViTImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "apple/mobilevit-small"
@ -745,7 +744,6 @@ class MobileViTModel(MobileViTPreTrainedModel):
@add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -819,7 +817,6 @@ class MobileViTForImageClassification(MobileViTPreTrainedModel):
@add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "MobileViTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "MobileViTImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "apple/mobilevit-small"
@ -839,7 +838,6 @@ class TFMobileViTModel(TFMobileViTPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@ -889,7 +887,6 @@ class TFMobileViTForImageClassification(TFMobileViTPreTrainedModel, TFSequenceCl
@unpack_inputs
@add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -57,7 +57,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "NatConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "shi-labs/nat-mini-in1k-224"
@ -708,7 +707,6 @@ class NatModel(NatPreTrainedModel):
@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=NatModelOutput,
config_class=_CONFIG_FOR_DOC,
@ -788,7 +786,6 @@ class NatForImageClassification(NatPreTrainedModel):
@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=NatImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -204,7 +204,7 @@ class OwlViTObjectDetectionOutput(ModelOutput):
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~OwlViTFeatureExtractor.post_process_object_detection`] to retrieve the
possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to retrieve the
unnormalized bounding boxes.
text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`):
The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
@ -248,12 +248,12 @@ class OwlViTImageGuidedObjectDetectionOutput(ModelOutput):
target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual target image in the batch
(disregarding possible padding). You can use [`~OwlViTFeatureExtractor.post_process_object_detection`] to
(disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
retrieve the unnormalized bounding boxes.
query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual query image in the batch
(disregarding possible padding). You can use [`~OwlViTFeatureExtractor.post_process_object_detection`] to
(disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
retrieve the unnormalized bounding boxes.
image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
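
The docstrings in this file point readers at [`~OwlViTImageProcessor.post_process_object_detection`] for recovering unnormalized boxes; a minimal sketch of that call (the checkpoint and query text are illustrative, not taken from this diff):

```python
import requests
import torch
from PIL import Image

from transformers import OwlViTForObjectDetection, OwlViTImageProcessor, OwlViTProcessor

checkpoint = "google/owlvit-base-patch32"  # illustrative checkpoint
processor = OwlViTProcessor.from_pretrained(checkpoint)
image_processor = OwlViTImageProcessor.from_pretrained(checkpoint)
model = OwlViTForObjectDetection.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=[["a photo of a cat"]], images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# pred_boxes are normalized to [0, 1]; post-processing rescales them to pixel coordinates.
target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
results = image_processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes)
print(results[0]["boxes"])  # unnormalized (xmin, ymin, xmax, ymax) boxes
```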

View File

@ -34,7 +34,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "PoolFormerConfig"
_FEAT_EXTRACTOR_FOR_DOC = "PoolFormerImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "sail/poolformer_s12"
@ -326,7 +325,6 @@ class PoolFormerModel(PoolFormerPreTrainedModel):
@add_start_docstrings_to_model_forward(POOLFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -397,7 +395,6 @@ class PoolFormerForImageClassification(PoolFormerPreTrainedModel):
@add_start_docstrings_to_model_forward(POOLFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -37,7 +37,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "RegNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/regnet-y-040"
@ -341,7 +340,6 @@ class RegNetModel(RegNetPreTrainedModel):
@add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -399,7 +397,6 @@ class RegNetForImageClassification(RegNetPreTrainedModel):
@add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -35,7 +35,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "RegNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/regnet-y-040"
@ -411,7 +410,6 @@ class TFRegNetModel(TFRegNetPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -477,7 +475,6 @@ class TFRegNetForImageClassification(TFRegNetPreTrainedModel, TFSequenceClassifi
@unpack_inputs
@add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ResNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
@ -312,7 +311,6 @@ class ResNetModel(ResNetPreTrainedModel):
@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -369,7 +367,6 @@ class ResNetForImageClassification(ResNetPreTrainedModel):
@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -34,7 +34,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ResNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
@ -393,7 +392,6 @@ class TFResNetModel(TFResNetPreTrainedModel):
@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -458,7 +456,6 @@ class TFResNetForImageClassification(TFResNetPreTrainedModel, TFSequenceClassifi
@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -42,7 +42,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "SegformerConfig"
_FEAT_EXTRACTOR_FOR_DOC = "SegformerImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
@ -529,7 +528,6 @@ class SegformerModel(SegformerPreTrainedModel):
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
@ -589,7 +587,6 @@ class SegformerForImageClassification(SegformerPreTrainedModel):
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=SegFormerImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -37,7 +37,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "SegformerConfig"
_FEAT_EXTRACTOR_FOR_DOC = "SegformerImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
@ -606,7 +605,6 @@ class TFSegformerModel(TFSegformerPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
@ -659,7 +657,6 @@ class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceCl
@unpack_inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -44,7 +44,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "SwinConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
@ -963,7 +962,6 @@ class SwinModel(SwinPreTrainedModel):
@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SwinModelOutput,
config_class=_CONFIG_FOR_DOC,
@ -1168,7 +1166,6 @@ class SwinForImageClassification(SwinPreTrainedModel):
@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=SwinImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -47,7 +47,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "SwinConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
@ -1192,7 +1191,6 @@ class TFSwinModel(TFSwinPreTrainedModel):
@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSwinModelOutput,
config_class=_CONFIG_FOR_DOC,
@ -1429,7 +1427,6 @@ class TFSwinForImageClassification(TFSwinPreTrainedModel, TFSequenceClassificati
@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSwinImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "Swin2SRConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "caidas/swin2SR-classical-sr-x2-64"
@ -823,8 +822,8 @@ SWIN2SR_START_DOCSTRING = r"""
SWIN2SR_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
[`AutoFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@ -897,7 +896,6 @@ class Swin2SRModel(Swin2SRPreTrainedModel):
@add_start_docstrings_to_model_forward(SWIN2SR_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "Swinv2Config"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swinv2-tiny-patch4-window8-256"
@ -1043,7 +1042,6 @@ class Swinv2Model(Swinv2PreTrainedModel):
@add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Swinv2ModelOutput,
config_class=_CONFIG_FOR_DOC,
@ -1251,7 +1249,6 @@ class Swinv2ForImageClassification(Swinv2PreTrainedModel):
@add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=Swinv2ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -38,7 +38,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "VanConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base"
@ -435,7 +434,6 @@ class VanModel(VanPreTrainedModel):
@add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@ -493,7 +491,6 @@ class VanForImageClassification(VanPreTrainedModel):
@add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,

View File

@ -635,8 +635,8 @@ VILT_INPUTS_DOCSTRING = r"""
[What are token type IDs?](../glossary#token-type-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`ViltFeatureExtractor`]. See
[`ViltFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`ViltImageProcessor`]. See
[`ViltImageProcessor.__call__`] for details.
pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
@ -690,8 +690,8 @@ VILT_IMAGES_AND_TEXT_CLASSIFICATION_INPUTS_DOCSTRING = r"""
[What are token type IDs?](../glossary#token-type-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`ViltFeatureExtractor`]. See
[`ViltFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`ViltImageProcessor`]. See
[`ViltImageProcessor.__call__`] for details.
pixel_mask (`torch.LongTensor` of shape `(batch_size, num_images, height, width)`, *optional*):
Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
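
For the ViLT inputs described above, `pixel_values` and the text inputs are usually produced together by [`ViltProcessor`], which wraps [`ViltImageProcessor`] and the tokenizer; a hedged sketch (checkpoint and question are illustrative):

```python
import requests
from PIL import Image

from transformers import ViltForQuestionAnswering, ViltProcessor

checkpoint = "dandelin/vilt-b32-finetuned-vqa"  # illustrative checkpoint
processor = ViltProcessor.from_pretrained(checkpoint)
model = ViltForQuestionAnswering.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Returns the tokenized text tensors together with pixel_values for the image.
inputs = processor(image, "How many cats are there?", return_tensors="pt")

outputs = model(**inputs)
print(model.config.id2label[outputs.logits.argmax(-1).item()])
```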

View File

@ -556,13 +556,13 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
>>> from transformers import (
... FlaxVisionTextDualEncoderModel,
... VisionTextDualEncoderProcessor,
... ViTFeatureExtractor,
... ViTImageProcessor,
... BertTokenizer,
... )
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
>>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
>>> processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
>>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
... "google/vit-base-patch16-224", "bert-base-uncased"
... )

View File

@ -41,7 +41,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ViTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@ -670,7 +669,6 @@ class TFViTModel(TFViTPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@ -764,7 +762,6 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification
@unpack_inputs
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -42,7 +42,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ViTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@ -536,7 +535,6 @@ class ViTModel(ViTPreTrainedModel):
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@ -765,7 +763,6 @@ class ViTForImageClassification(ViTPreTrainedModel):
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -37,7 +37,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ViTHybridConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
# Base docstring
_CHECKPOINT_FOR_DOC = "google/vit-hybrid-base-bit-384"
@ -508,8 +507,8 @@ VIT_START_DOCSTRING = r"""
VIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
[`AutoFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@ -560,7 +559,6 @@ class ViTHybridModel(ViTHybridPreTrainedModel):
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@ -664,7 +662,6 @@ class ViTHybridForImageClassification(ViTHybridPreTrainedModel):
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,

View File

@ -584,7 +584,7 @@ X_CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@ -619,7 +619,7 @@ X_CLIP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):

View File

@ -53,7 +53,6 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "YolosConfig"
_FEAT_EXTRACTOR_FOR_DOC = "YolosImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "hustvl/yolos-small"
@ -627,7 +626,6 @@ class YolosModel(YolosPreTrainedModel):
@add_start_docstrings_to_model_forward(YOLOS_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,

View File

@ -861,10 +861,10 @@ TF_VISION_BASE_MODEL_SAMPLE = r"""
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> image_processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = feature_extractor(image, return_tensors="tf")
>>> inputs = image_processor(image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
@ -884,10 +884,10 @@ TF_VISION_SEQ_CLASS_SAMPLE = r"""
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> image_processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = feature_extractor(image, return_tensors="tf")
>>> inputs = image_processor(image, return_tensors="tf")
>>> logits = model(**inputs).logits
>>> # model predicts one of the 1000 ImageNet classes
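
For reference, the updated TF sequence-classification sample renders roughly as follows once the template placeholders are filled in (a sketch; `ViTImageProcessor`, `TFViTForImageClassification` and `google/vit-base-patch16-224` are illustrative substitutions for `{processor_class}`, `{model_class}` and `{checkpoint}`):

```python
import tensorflow as tf
from datasets import load_dataset

from transformers import TFViTForImageClassification, ViTImageProcessor

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

inputs = image_processor(image, return_tensors="tf")
logits = model(**inputs).logits

# model predicts one of the 1000 ImageNet classes
predicted_label = int(tf.math.argmax(logits, axis=-1)[0])
print(model.config.id2label[predicted_label])
```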