Mirror of https://github.com/huggingface/transformers.git, synced 2025-08-02 19:21:31 +06:00
Update modeling doc strings FE -> IP (#21106)
* Update docs examples FE -> IP
* Remove _IMAGE_PROCESSOR_FOR_DOC
This commit is contained in:
parent 5d3cb760a0
commit 91c2278b97
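For context, the renamed classes follow the image-processor API that supersedes the older feature extractors. Below is a minimal, hedged sketch of the workflow the updated docstrings describe; the checkpoint name is an illustrative choice, not something taken from this diff.

```python
from PIL import Image
import requests

from transformers import CLIPImageProcessor, CLIPVisionModel

# CLIPImageProcessor is the class the docstrings now point to; CLIPFeatureExtractor
# was the older entry point with the same role.
checkpoint = "openai/clip-vit-base-patch32"  # illustrative checkpoint, not part of this diff
image_processor = CLIPImageProcessor.from_pretrained(checkpoint)
model = CLIPVisionModel.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# The image processor produces the `pixel_values` tensor the modeling docstrings refer to.
inputs = image_processor(images=image, return_tensors="pt")
outputs = model(pixel_values=inputs.pixel_values)
last_hidden_state = outputs.last_hidden_state
```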
@@ -98,7 +98,7 @@ ALTCLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -133,7 +133,7 @@ ALTCLIP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
@@ -49,7 +49,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "BeitConfig"
_FEAT_EXTRACTOR_FOR_DOC = "BeitImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/beit-base-patch16-224-pt22k"
@@ -646,7 +645,6 @@ class BeitModel(BeitPreTrainedModel):

@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BeitModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@@ -844,7 +842,6 @@ class BeitForImageClassification(BeitPreTrainedModel):

@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -46,7 +46,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "BitConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"

# Base docstring
_CHECKPOINT_FOR_DOC = "google/bit-50"
@@ -688,8 +687,8 @@ BIT_START_DOCSTRING = r"""
BIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
[`AutoFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.

output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
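Several of the hunks above swap [`AutoFeatureExtractor`] for [`AutoImageProcessor`], which resolves to the model-specific image processor. A hedged sketch of that generic path, assuming the `google/bit-50` checkpoint named in the constants above:

```python
from PIL import Image
import requests

from transformers import AutoImageProcessor, AutoModelForImageClassification

# AutoImageProcessor picks the right image processor for the checkpoint,
# replacing the AutoFeatureExtractor path in the old docstring lines.
checkpoint = "google/bit-50"  # named in the constants above; any vision checkpoint works
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForImageClassification.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(image, return_tensors="pt")
logits = model(**inputs).logits
print(model.config.id2label[int(logits.argmax(-1))])
```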
@@ -723,7 +722,6 @@ class BitModel(BitPreTrainedModel):

@add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -782,7 +780,6 @@ class BitForImageClassification(BitPreTrainedModel):

@add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -810,7 +810,7 @@ CHINESE_CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`ChineseCLIPFeatureExtractor`]. See [`ChineseCLIPFeatureExtractor.__call__`] for details.
[`ChineseCLIPImageProcessor`]. See [`ChineseCLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -853,7 +853,7 @@ CHINESE_CLIP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`ChineseCLIPFeatureExtractor`]. See [`ChineseCLIPFeatureExtractor.__call__`] for details.
[`ChineseCLIPImageProcessor`]. See [`ChineseCLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
@@ -521,7 +521,7 @@ CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -556,7 +556,7 @@ CLIP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
@@ -108,7 +108,7 @@ CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -143,7 +143,7 @@ CLIP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -893,7 +893,7 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
Args:
pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained
using [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
using [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.

Returns:
image_features (`jnp.ndarray` of shape `(batch_size, output_dim`): The image embeddings obtained by
@@ -993,8 +993,8 @@ CLIP_TEXT_INPUTS_DOCSTRING = r"""
CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to
Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to
return the attentions tensors of all attention layers. See `attentions` under returned tensors for more
detail. This argument can be used only in eager mode, in graph mode the value in the config will be used
instead.
@@ -1020,8 +1020,8 @@ CLIP_INPUTS_DOCSTRING = r"""

[What are input IDs?](../glossary#input-ids)
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

@@ -530,7 +530,7 @@ CLIPSEG_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -565,7 +565,7 @@ CLIPSEG_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
@@ -44,7 +44,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ConvNextConfig"
_FEAT_EXTRACTOR_FOR_DOC = "ConvNextImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"
@@ -346,7 +345,6 @@ class ConvNextModel(ConvNextPreTrainedModel):

@add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -414,7 +412,6 @@ class ConvNextForImageClassification(ConvNextPreTrainedModel):

@add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -35,7 +35,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "CvtConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/cvt-13"
@@ -605,7 +604,6 @@ class CvtModel(CvtPreTrainedModel):

@add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithCLSToken,
config_class=_CONFIG_FOR_DOC,
@@ -668,7 +666,6 @@ class CvtForImageClassification(CvtPreTrainedModel):

@add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -48,7 +48,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "Data2VecVisionConfig"
_FEAT_EXTRACTOR_FOR_DOC = "BeitImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
@@ -660,7 +659,6 @@ class Data2VecVisionModel(Data2VecVisionPreTrainedModel):

@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Data2VecVisionModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@@ -760,7 +758,6 @@ class Data2VecVisionForImageClassification(Data2VecVisionPreTrainedModel):

@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -53,7 +53,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "Data2VecVisionConfig"
_FEAT_EXTRACTOR_FOR_DOC = "BeitImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
@@ -894,7 +893,6 @@ class TFData2VecVisionModel(TFData2VecVisionPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFData2VecVisionModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@@ -960,7 +958,6 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF
@unpack_inputs
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -44,7 +44,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "DeiTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "DeiTImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
@@ -483,7 +482,6 @@ class DeiTModel(DeiTPreTrainedModel):

@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@@ -854,7 +852,6 @@ class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):

@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=DeiTForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,
@@ -52,7 +52,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "DeiTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "DeiTImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
@@ -651,7 +650,6 @@ class TFDeiTModel(TFDeiTPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@@ -1009,7 +1007,6 @@ class TFDeiTForImageClassificationWithTeacher(TFDeiTPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFDeiTForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,
@@ -57,7 +57,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "DinatConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "shi-labs/dinat-mini-in1k-224"
@@ -730,7 +729,6 @@ class DinatModel(DinatPreTrainedModel):

@add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=DinatModelOutput,
config_class=_CONFIG_FOR_DOC,
@@ -810,7 +808,6 @@ class DinatForImageClassification(DinatPreTrainedModel):

@add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=DinatImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "DonutSwinConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"

# Base docstring
_CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base"
@@ -847,8 +846,8 @@ SWIN_START_DOCSTRING = r"""
SWIN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
[`AutoFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

@@ -898,7 +897,6 @@ class DonutSwinModel(DonutSwinPreTrainedModel):

@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=DonutSwinModelOutput,
config_class=_CONFIG_FOR_DOC,
@@ -49,7 +49,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "DPTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "DPTImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "Intel/dpt-large"
@@ -898,7 +897,6 @@ class DPTModel(DPTPreTrainedModel):

@add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndIntermediateActivations,
config_class=_CONFIG_FOR_DOC,
@@ -51,7 +51,6 @@ _CHECKPOINT_FOR_DOC = "facebook/flava-full"

# Codebook docstring
_CHECKPOINT_FOR_CODEBOOK_DOC = "facebook/flava-image-codebook"
_FEAT_EXTRACTOR_FOR_DOC = "FlavaFeatureExtractor"
_CONFIG_CLASS_FOR_IMAGE_MODEL_DOC = "FlavaImageConfig"
_CONFIG_CLASS_FOR_TEXT_MODEL_DOC = "FlavaTextConfig"
_CONFIG_CLASS_FOR_MULTIMODAL_MODEL_DOC = "FlavaMultimodalConfig"
@@ -750,8 +749,8 @@ FLAVA_INPUTS_DOCSTRING_COMMON = r"""
FLAVA_IMAGE_INPUTS_DOCSTRING_BASE = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`FlavaFeatureExtractor`]. See
[`FlavaFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`FlavaImageProcessor`]. See
[`FlavaImageProcessor.__call__`] for details.

bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
@@ -926,7 +925,6 @@ class FlavaImageModel(FlavaPreTrainedModel):

@add_start_docstrings_to_model_forward(FLAVA_IMAGE_INPUTS_DOCSTRING.format("batch_size, image_num_patches"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_CLASS_FOR_IMAGE_MODEL_DOC,
@@ -1568,22 +1566,22 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Codebook pixel values can be obtained using [`FlavaFeatureExtractor`] by passing
`return_codebook_pixels=True`. See [`FlavaFeatureExtractor.__call__`] for details.
Pixel values. Codebook pixel values can be obtained using [`FlavaImageProcessor`] by passing
`return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import FlavaFeatureExtractor, FlavaImageCodebook
>>> from transformers import FlavaImageProcessor, FlavaImageCodebook

>>> model = FlavaImageCodebook.from_pretrained("{0}")
>>> feature_extractor = FlavaFeatureExtractor.from_pretrained("{0}")
>>> image_processor = FlavaImageProcessor.from_pretrained("{0}")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = feature_extractor([image], return_codebook_pixels=True, return_tensors="pt")
>>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
>>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

>>> outputs = model.get_codebook_indices(**inputs)
@@ -1602,23 +1600,23 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Codebook pixel values can be obtained using [`FlavaFeatureExtractor`] by passing
`return_codebook_pixels=True`. See [`FlavaFeatureExtractor.__call__`] for details.
Pixel values. Codebook pixel values can be obtained using [`FlavaImageProcessor`] by passing
`return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import FlavaFeatureExtractor, FlavaImageCodebook
>>> from transformers import FlavaImageProcessor, FlavaImageCodebook

>>> model = FlavaImageCodebook.from_pretrained("{0}")
>>> feature_extractor = FlavaFeatureExtractor.from_pretrained("{0}")
>>> image_processor = FlavaImageProcessor.from_pretrained("{0}")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = feature_extractor([image], return_codebook_pixels=True, return_tensors="pt")
>>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
>>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

>>> outputs = model(**inputs)
@@ -916,7 +916,7 @@ GIT_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -41,7 +41,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "GLPNConfig"
_FEAT_EXTRACTOR_FOR_DOC = "GLPNImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "vinvino02/glpn-kitti"
@@ -503,7 +502,6 @@ class GLPNModel(GLPNPreTrainedModel):

@add_start_docstrings_to_model_forward(GLPN_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
@@ -857,7 +857,7 @@ GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -891,8 +891,8 @@ GROUPVIT_INPUTS_DOCSTRING = r"""

[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
@@ -1555,8 +1555,8 @@ GROUPVIT_TEXT_INPUTS_DOCSTRING = r"""
GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
@@ -1583,8 +1583,8 @@ GROUPVIT_INPUTS_DOCSTRING = r"""

[What are input IDs?](../glossary#input-ids)
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

@@ -38,7 +38,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "LevitConfig"
_FEAT_EXTRACTOR_FOR_DOC = "LevitImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/levit-128S"
@@ -549,7 +548,6 @@ class LevitModel(LevitPreTrainedModel):

@add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -618,7 +616,6 @@ class LevitForImageClassification(LevitPreTrainedModel):

@add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -711,7 +708,6 @@ class LevitForImageClassificationWithTeacher(LevitPreTrainedModel):

@add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=LevitForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,
@@ -51,7 +51,6 @@ logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "MaskFormerConfig"
_CHECKPOINT_FOR_DOC = "facebook/maskformer-swin-base-ade"
_FEAT_EXTRACTOR_FOR_DOC = "MaskFormerImageProcessor"

MASKFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/maskformer-swin-base-ade",
@@ -33,7 +33,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "MobileNetV1Config"
_FEAT_EXTRACTOR_FOR_DOC = "MobileNetV1ImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "google/mobilenet_v1_1.0_224"
@@ -355,7 +354,6 @@ class MobileNetV1Model(MobileNetV1PreTrainedModel):

@add_start_docstrings_to_model_forward(MOBILENET_V1_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -428,7 +426,6 @@ class MobileNetV1ForImageClassification(MobileNetV1PreTrainedModel):

@add_start_docstrings_to_model_forward(MOBILENET_V1_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "MobileNetV2Config"
_FEAT_EXTRACTOR_FOR_DOC = "MobileNetV2ImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "google/mobilenet_v2_1.0_224"
@@ -566,7 +565,6 @@ class MobileNetV2Model(MobileNetV2PreTrainedModel):

@add_start_docstrings_to_model_forward(MOBILENET_V2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -639,8 +637,6 @@ class MobileNetV2ForImageClassification(MobileNetV2PreTrainedModel):

@add_start_docstrings_to_model_forward(MOBILENET_V2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
@@ -49,7 +49,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "MobileViTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "MobileViTImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "apple/mobilevit-small"
@@ -745,7 +744,6 @@ class MobileViTModel(MobileViTPreTrainedModel):

@add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -819,7 +817,6 @@ class MobileViTForImageClassification(MobileViTPreTrainedModel):

@add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "MobileViTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "MobileViTImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "apple/mobilevit-small"
@@ -839,7 +838,6 @@ class TFMobileViTModel(TFMobileViTPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@@ -889,7 +887,6 @@ class TFMobileViTForImageClassification(TFMobileViTPreTrainedModel, TFSequenceCl
@unpack_inputs
@add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -57,7 +57,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "NatConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "shi-labs/nat-mini-in1k-224"
@@ -708,7 +707,6 @@ class NatModel(NatPreTrainedModel):

@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=NatModelOutput,
config_class=_CONFIG_FOR_DOC,
@@ -788,7 +786,6 @@ class NatForImageClassification(NatPreTrainedModel):

@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=NatImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -204,7 +204,7 @@ class OwlViTObjectDetectionOutput(ModelOutput):
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~OwlViTFeatureExtractor.post_process_object_detection`] to retrieve the
possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to retrieve the
unnormalized bounding boxes.
text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`):
The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
@@ -248,12 +248,12 @@ class OwlViTImageGuidedObjectDetectionOutput(ModelOutput):
target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual target image in the batch
(disregarding possible padding). You can use [`~OwlViTFeatureExtractor.post_process_object_detection`] to
(disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
retrieve the unnormalized bounding boxes.
query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual query image in the batch
(disregarding possible padding). You can use [`~OwlViTFeatureExtractor.post_process_object_detection`] to
(disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
retrieve the unnormalized bounding boxes.
image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
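The OWL-ViT hunks above re-point post-processing at [`~OwlViTImageProcessor.post_process_object_detection`]. A hedged sketch of how that call is usually reached through the processor; the checkpoint and query text are illustrative and not taken from this diff:

```python
import requests
import torch
from PIL import Image

from transformers import OwlViTForObjectDetection, OwlViTProcessor

checkpoint = "google/owlvit-base-patch32"  # illustrative checkpoint
processor = OwlViTProcessor.from_pretrained(checkpoint)
model = OwlViTForObjectDetection.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=[["a photo of a cat"]], images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# post_process_object_detection rescales the normalized box predictions described
# above back to absolute pixel coordinates of the original image.
target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
results = processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes)
print(results[0]["boxes"], results[0]["scores"], results[0]["labels"])
```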
@@ -34,7 +34,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "PoolFormerConfig"
_FEAT_EXTRACTOR_FOR_DOC = "PoolFormerImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "sail/poolformer_s12"
@@ -326,7 +325,6 @@ class PoolFormerModel(PoolFormerPreTrainedModel):

@add_start_docstrings_to_model_forward(POOLFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -397,7 +395,6 @@ class PoolFormerForImageClassification(PoolFormerPreTrainedModel):

@add_start_docstrings_to_model_forward(POOLFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -37,7 +37,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "RegNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/regnet-y-040"
@@ -341,7 +340,6 @@ class RegNetModel(RegNetPreTrainedModel):

@add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -399,7 +397,6 @@ class RegNetForImageClassification(RegNetPreTrainedModel):

@add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -35,7 +35,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "RegNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/regnet-y-040"
@@ -411,7 +410,6 @@ class TFRegNetModel(TFRegNetPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -477,7 +475,6 @@ class TFRegNetForImageClassification(TFRegNetPreTrainedModel, TFSequenceClassifi
@unpack_inputs
@add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ResNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
@@ -312,7 +311,6 @@ class ResNetModel(ResNetPreTrainedModel):

@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -369,7 +367,6 @@ class ResNetForImageClassification(ResNetPreTrainedModel):

@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -34,7 +34,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ResNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
@@ -393,7 +392,6 @@ class TFResNetModel(TFResNetPreTrainedModel):

@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -458,7 +456,6 @@ class TFResNetForImageClassification(TFResNetPreTrainedModel, TFSequenceClassifi

@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -42,7 +42,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "SegformerConfig"
_FEAT_EXTRACTOR_FOR_DOC = "SegformerImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
@@ -529,7 +528,6 @@ class SegformerModel(SegformerPreTrainedModel):

@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
@@ -589,7 +587,6 @@ class SegformerForImageClassification(SegformerPreTrainedModel):

@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=SegFormerImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -37,7 +37,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "SegformerConfig"
_FEAT_EXTRACTOR_FOR_DOC = "SegformerImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
@@ -606,7 +605,6 @@ class TFSegformerModel(TFSegformerPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
@@ -659,7 +657,6 @@ class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceCl
@unpack_inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -44,7 +44,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "SwinConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
@@ -963,7 +962,6 @@ class SwinModel(SwinPreTrainedModel):

@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SwinModelOutput,
config_class=_CONFIG_FOR_DOC,
@@ -1168,7 +1166,6 @@ class SwinForImageClassification(SwinPreTrainedModel):

@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=SwinImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -47,7 +47,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "SwinConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
@@ -1192,7 +1191,6 @@ class TFSwinModel(TFSwinPreTrainedModel):

@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSwinModelOutput,
config_class=_CONFIG_FOR_DOC,
@@ -1429,7 +1427,6 @@ class TFSwinForImageClassification(TFSwinPreTrainedModel, TFSequenceClassificati

@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSwinImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "Swin2SRConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "caidas/swin2SR-classical-sr-x2-64"
@@ -823,8 +822,8 @@ SWIN2SR_START_DOCSTRING = r"""
SWIN2SR_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
[`AutoFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

@@ -897,7 +896,6 @@ class Swin2SRModel(Swin2SRPreTrainedModel):

@add_start_docstrings_to_model_forward(SWIN2SR_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
@@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "Swinv2Config"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swinv2-tiny-patch4-window8-256"
@@ -1043,7 +1042,6 @@ class Swinv2Model(Swinv2PreTrainedModel):

@add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Swinv2ModelOutput,
config_class=_CONFIG_FOR_DOC,
@@ -1251,7 +1249,6 @@ class Swinv2ForImageClassification(Swinv2PreTrainedModel):

@add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=Swinv2ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -38,7 +38,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "VanConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base"
@@ -435,7 +434,6 @@ class VanModel(VanPreTrainedModel):

@add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -493,7 +491,6 @@ class VanForImageClassification(VanPreTrainedModel):

@add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
@@ -635,8 +635,8 @@ VILT_INPUTS_DOCSTRING = r"""
[What are token type IDs?](../glossary#token-type-ids)

pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`ViltFeatureExtractor`]. See
[`ViltFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`ViltImageProcessor`]. See
[`ViltImageProcessor.__call__`] for details.

pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
@@ -690,8 +690,8 @@ VILT_IMAGES_AND_TEXT_CLASSIFICATION_INPUTS_DOCSTRING = r"""
[What are token type IDs?](../glossary#token-type-ids)

pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`ViltFeatureExtractor`]. See
[`ViltFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`ViltImageProcessor`]. See
[`ViltImageProcessor.__call__`] for details.

pixel_mask (`torch.LongTensor` of shape `(batch_size, num_images, height, width)`, *optional*):
Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
@@ -556,13 +556,13 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
>>> from transformers import (
... FlaxVisionTextDualEncoderModel,
... VisionTextDualEncoderProcessor,
... ViTFeatureExtractor,
... ViTImageProcessor,
... BertTokenizer,
... )

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
>>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
>>> processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
>>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
... "google/vit-base-patch16-224", "bert-base-uncased"
... )
@@ -41,7 +41,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ViTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@@ -670,7 +669,6 @@ class TFViTModel(TFViTPreTrainedModel):
@unpack_inputs
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@@ -764,7 +762,6 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification
@unpack_inputs
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -42,7 +42,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ViTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@@ -536,7 +535,6 @@ class ViTModel(ViTPreTrainedModel):

@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@@ -765,7 +763,6 @@ class ViTForImageClassification(ViTPreTrainedModel):

@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -37,7 +37,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ViTHybridConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"

# Base docstring
_CHECKPOINT_FOR_DOC = "google/vit-hybrid-base-bit-384"
@@ -508,8 +507,8 @@ VIT_START_DOCSTRING = r"""
VIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
[`AutoFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.

head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -560,7 +559,6 @@ class ViTHybridModel(ViTHybridPreTrainedModel):

@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@@ -664,7 +662,6 @@ class ViTHybridForImageClassification(ViTHybridPreTrainedModel):

@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
@@ -584,7 +584,7 @@ X_CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -619,7 +619,7 @@ X_CLIP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
[`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
@@ -53,7 +53,6 @@ logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "YolosConfig"
_FEAT_EXTRACTOR_FOR_DOC = "YolosImageProcessor"

# Base docstring
_CHECKPOINT_FOR_DOC = "hustvl/yolos-small"
@@ -627,7 +626,6 @@ class YolosModel(YolosPreTrainedModel):

@add_start_docstrings_to_model_forward(YOLOS_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
@@ -861,10 +861,10 @@ TF_VISION_BASE_MODEL_SAMPLE = r"""
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]

>>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> image_processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")

>>> inputs = feature_extractor(image, return_tensors="tf")
>>> inputs = image_processor(image, return_tensors="tf")
>>> outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
@@ -884,10 +884,10 @@ TF_VISION_SEQ_CLASS_SAMPLE = r"""
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]

>>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> image_processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")

>>> inputs = feature_extractor(image, return_tensors="tf")
>>> inputs = image_processor(image, return_tensors="tf")
>>> logits = model(**inputs).logits

>>> # model predicts one of the 1000 ImageNet classes