From a5e6df82c00cb53ffe863008cbbedd813bcc508b Mon Sep 17 00:00:00 2001
From: Isaac Chung <48971969+isaac-chung@users.noreply.github.com>
Date: Mon, 9 Oct 2023 18:39:05 +0300
Subject: [PATCH] [docstring] Fix docstrings for `CLIP` (#26691)

fix docstrings for vanilla clip
---
 src/transformers/models/clip/configuration_clip.py |  8 ++++++--
 src/transformers/models/clip/tokenization_clip.py  |  8 +++++---
 .../models/clip/tokenization_clip_fast.py          | 14 +++++++++-----
 utils/check_docstrings.py                          |  3 ---
 4 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py
index 5cf6d1a7668..f9ecf5f7d46 100644
--- a/src/transformers/models/clip/configuration_clip.py
+++ b/src/transformers/models/clip/configuration_clip.py
@@ -168,10 +168,14 @@ class CLIPVisionConfig(PretrainedConfig):
             Dimensionality of the encoder layers and the pooler layer.
         intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimentionality of text and vision projection layers.
         num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
         image_size (`int`, *optional*, defaults to 224):
             The size (resolution) of each image.
         patch_size (`int`, *optional*, defaults to 32):
@@ -179,13 +183,13 @@ class CLIPVisionConfig(PretrainedConfig):
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (`float`, *optional*, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1.0):
             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
             testing).
 
diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py
index 388c455a438..f62ef65c5ed 100644
--- a/src/transformers/models/clip/tokenization_clip.py
+++ b/src/transformers/models/clip/tokenization_clip.py
@@ -284,13 +284,15 @@ class CLIPTokenizer(PreTrainedTokenizer):
         errors (`str`, *optional*, defaults to `"replace"`):
             Paradigm to follow when decoding bytes to UTF-8. See
             [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
             The beginning of sequence token.
-        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding, for example when batching sequences of different lengths.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/clip/tokenization_clip_fast.py b/src/transformers/models/clip/tokenization_clip_fast.py
index 75b3e4f4078..3b092b0f8d5 100644
--- a/src/transformers/models/clip/tokenization_clip_fast.py
+++ b/src/transformers/models/clip/tokenization_clip_fast.py
@@ -56,17 +56,21 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
     refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             Path to the vocabulary file.
-        merges_file (`str`):
+        merges_file (`str`, *optional*):
             Path to the merges file.
-        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+        tokenizer_file (`str`, *optional*):
+            The path to a tokenizer file to use instead of the vocab file.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
             The beginning of sequence token.
-        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding, for example when batching sequences of different lengths.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
index d40c737ab0c..2832e347ab5 100644
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -118,9 +118,6 @@ OBJECTS_TO_IGNORE = [
     "BridgeTowerTextConfig",
     "BridgeTowerVisionConfig",
     "BrosModel",
-    "CLIPTokenizer",
-    "CLIPTokenizerFast",
-    "CLIPVisionConfig",
     "CamembertConfig",
     "CamembertModel",
     "CamembertTokenizerFast",
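
Reviewer note (not part of the patch): the defaults documented above can be cross-checked against the actual signatures and config attributes, which is essentially what utils/check_docstrings.py enforces once the three CLIP entries are dropped from OBJECTS_TO_IGNORE. A minimal sketch, assuming a local install of transformers (with the tokenizers dependency for the fast tokenizer):

# Sketch: confirm the documented defaults match the real code.
import inspect

from transformers import CLIPTokenizer, CLIPTokenizerFast, CLIPVisionConfig

# CLIPVisionConfig defaults now documented:
# projection_dim=512, num_channels=3, layer_norm_eps=1e-05, initializer_factor=1.0
config = CLIPVisionConfig()
assert config.projection_dim == 512
assert config.num_channels == 3
assert config.layer_norm_eps == 1e-05
assert config.initializer_factor == 1.0

# Both tokenizers now document pad_token, which defaults to "<|endoftext|>"
for cls in (CLIPTokenizer, CLIPTokenizerFast):
    default = inspect.signature(cls.__init__).parameters["pad_token"].default
    assert default == "<|endoftext|>", cls.__name__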