mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-31 10:12:23 +06:00
[docstring] Fix docstring for CodeLlamaTokenizerFast
(#26666)
* remove from OBJECTS_TO_IGNORE * run check_docstrings.py * fill in information * ignore CodeLlamaTokenizer
This commit is contained in:
parent
69a26c7ecd
commit
5c081e2993
@ -75,37 +75,39 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
||||
which supports prompt infilling.
|
||||
|
||||
Args:
|
||||
vocab_file (`str`):
|
||||
vocab_file (`str`, *optional*):
|
||||
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
|
||||
contains the vocabulary necessary to instantiate a tokenizer.
|
||||
tokenizer_file (`str`):
|
||||
tokenizer_file (`str`, *optional*):
|
||||
[tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
|
||||
contains everything needed to load the tokenizer.
|
||||
clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
|
||||
Whether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
|
||||
spaces.
|
||||
unk_token (`str`, *optional*, defaults to `"<unk>"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
bos_token (`str`, *optional*, defaults to `"<s>"`):
|
||||
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
|
||||
eos_token (`str`, *optional*, defaults to `"</s>"`):
|
||||
The end of sequence token.
|
||||
unk_token (`str`, *optional*, defaults to `"<unk>"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
prefix_token (`str`, *optional*, defaults to `"▁<PRE>"`):
|
||||
Prefix token used for infilling.
|
||||
suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
|
||||
Suffix token used for infilling.
|
||||
middle_token (`str`, *optional*, defaults to `"▁<MID>"`):
|
||||
Middle token used for infilling.
|
||||
suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
|
||||
Suffix token used for infilling.
|
||||
eot_token (`str`, *optional*, defaults to `"▁<EOT>"`):
|
||||
End of text token used for infilling.
|
||||
fill_token (`str`, *optional*, defaults to `"<FILL_ME>"`):
|
||||
The token used to split the input between the prefix and suffix.
|
||||
suffix_first (`bool`, *optional*, defaults to `False`):
|
||||
Whether the input prompt and suffix should be formatted with the suffix first.
|
||||
additional_special_tokens (`List[str]`, *optional*):
|
||||
Additional special tokens used by the tokenizer.
|
||||
use_default_system_prompt (`bool`, *optional*, defaults to `True`):
|
||||
add_bos_token (`bool`, *optional*, defaults to `True`):
|
||||
Whether to add a beginning of sequence token at the start of sequences.
|
||||
add_eos_token (`bool`, *optional*, defaults to `False`):
|
||||
Whether to add an end of sequence token at the end of sequences.
|
||||
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the default system prompt for Llama should be used.
|
||||
"""
|
||||
|
||||
|
@ -130,7 +130,6 @@ OBJECTS_TO_IGNORE = [
|
||||
"CodeGenConfig",
|
||||
"CodeGenTokenizer",
|
||||
"CodeGenTokenizerFast",
|
||||
"CodeLlamaTokenizerFast",
|
||||
"ConditionalDetrConfig",
|
||||
"ConditionalDetrImageProcessor",
|
||||
"ConvBertConfig",
|
||||
|
Loading…
Reference in New Issue
Block a user