[docstring] Fix docstring for CodeLlamaTokenizerFast (#26666)

* remove from OBJECTS_TO_IGNORE

* run check_docstrings.py

* fill in information

* ignore CodeLlamaTokenizer
Bojun-Feng authored on 2023-10-16 03:11:45 -05:00, committed by GitHub
parent 69a26c7ecd
commit 5c081e2993
2 changed files with 12 additions and 11 deletions


@@ -75,37 +75,39 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
which supports prompt infilling.
Args:
- vocab_file (`str`):
+ vocab_file (`str`, *optional*):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- tokenizer_file (`str`):
+ tokenizer_file (`str`, *optional*):
[tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
Whether to clean up spaces after decoding; cleanup consists of removing potential artifacts like extra
spaces.
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- unk_token (`str`, *optional*, defaults to `"<unk>"`):
- The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
- token instead.
prefix_token (`str`, *optional*, defaults to `"▁<PRE>"`):
Prefix token used for infilling.
- suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
- Suffix token used for infilling.
middle_token (`str`, *optional*, defaults to `"▁<MID>"`):
Middle token used for infilling.
+ suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
+ Suffix token used for infilling.
eot_token (`str`, *optional*, defaults to `"▁<EOT>"`):
End of text token used for infilling.
fill_token (`str`, *optional*, defaults to `"<FILL_ME>"`):
The token used to split the input between the prefix and suffix.
- suffix_first (`bool`, *optional*, default to `False`):
- Whether the input prompt and suffix should be formatted with the suffix first.
additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer.
- use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+ add_bos_token (`bool`, *optional*, defaults to `True`):
+ Whether to add a beginning of sequence token at the start of sequences.
+ add_eos_token (`bool`, *optional*, defaults to `False`):
+ Whether to add an end of sequence token at the end of sequences.
+ use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used.
"""


@@ -130,7 +130,6 @@ OBJECTS_TO_IGNORE = [
"CodeGenConfig",
"CodeGenTokenizer",
"CodeGenTokenizerFast",
"CodeLlamaTokenizerFast",
"ConditionalDetrConfig",
"ConditionalDetrImageProcessor",
"ConvBertConfig",