[docstring] Fix docstring for ChineseCLIP (#26880)

* Remove ChineseCLIPImageProcessor, ChineseCLIPTextConfig, ChineseCLIPVisionConfig from check_docstrings * Run fix_and_overwrite for ChineseCLIPImageProcessor, ChineseCLIPTextConfig, ChineseCLIPVisionConfig * Replace <fill_type> and <fill_docstring> in configuration_chinese_clip.py, image_processing_chinese_clip.py with type and docstring values --------- Co-authored-by: vignesh-raghunathan <vignesh_raghunathan@intuit.com>
2025-07-31 02:02:21 +06:00 · 2023-10-19 04:52:14 -04:00 · 2023-10-19 04:52:14 -04:00 · 816c2237c1
commit 816c2237c1
parent 574a538455
3 changed files with 13 additions and 7 deletions
--- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py
@ -75,8 +75,13 @@ class ChineseCLIPTextConfig(PretrainedConfig):
            The vocabulary size of the `token_type_ids` passed when calling [`ChineseCLIPModel`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1.0):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
@ -177,10 +182,14 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimentionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
@ -188,13 +197,13 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (`float``, *optional*, defaults to 1):
+        initializer_factor (`float``, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
    Example:
--- a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py
@ -59,7 +59,7 @@ class ChineseCLIPImageProcessor(BaseImageProcessor):
            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
            method.
-        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
@ -73,7 +73,7 @@ class ChineseCLIPImageProcessor(BaseImageProcessor):
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
-        do_normalize:
+        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@ -121,10 +121,7 @@ OBJECTS_TO_IGNORE = [
    "CamembertTokenizerFast",
    "CanineModel",
    "CanineTokenizer",
-    "ChineseCLIPImageProcessor",
-    "ChineseCLIPTextConfig",
    "ChineseCLIPTextModel",
-    "ChineseCLIPVisionConfig",
    "ClapTextConfig",
    "CodeGenConfig",
    "CodeGenTokenizer",