Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-04 05:10:06 +06:00

[docstring] Fix docstring for `LlamaTokenizer` and `LlamaTokenizerFast` (#26669)

* [docstring] Fix docstring for `LlamaTokenizer` and `LlamaTokenizerFast`
* [docstring] Fix docstring typo at `LlamaTokenizer` and `LlamaTokenizerFast`

This commit is contained in:
parent e58cbed51d
commit aaccf1844e
@@ -71,6 +71,43 @@ class LlamaTokenizer(PreTrainedTokenizer):
     Args:
         vocab_file (`str`):
             Path to the vocabulary file.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
+            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
+            attention mechanisms or loss computation.
+        sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add an `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+        use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+            Whether or not the default system prompt for Llama should be used.
+        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not to add spaces between special tokens.
         legacy (`bool`, *optional*):
             Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
             and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
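The arguments added to the `LlamaTokenizer` docstring above are existing constructor arguments that were previously undocumented. As a rough usage sketch (not part of the commit; it reuses the `hf-internal-testing/llama-tokenizer` checkpoint from the example further down in this diff):

```python
# Illustrative only: exercises the newly documented LlamaTokenizer arguments.
# Requires the `sentencepiece` package for the slow tokenizer.
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained(
    "hf-internal-testing/llama-tokenizer",
    add_bos_token=True,   # prepend bos_token ("<s>") to encoded sequences (documented default)
    add_eos_token=False,  # do not append eos_token ("</s>") (documented default)
    sp_model_kwargs={
        "enable_sampling": True,  # subword regularization, as described under sp_model_kwargs
        "nbest_size": -1,         # sample from the full lattice
        "alpha": 0.1,             # smoothing parameter for unigram sampling
    },
)

# With sampling enabled, repeated encodings of the same string may differ.
print(tokenizer.encode("Hello this is a test"))
```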
@@ -60,12 +60,12 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):

     This uses notably ByteFallback and no normalization.

-    ```
-    from transformers import LlamaTokenizerFast
+    ```python
+    >>> from transformers import LlamaTokenizerFast

-    tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
-    tokenizer.encode("Hello this is a test")
-    >>> [1, 15043, 445, 338, 263, 1243]
+    >>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
+    >>> tokenizer.encode("Hello this is a test")
+    [1, 15043, 445, 338, 263, 1243]
     ```

     If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
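The hunk above converts the usage example to doctest format: a `python`-tagged fence, `>>>` prompts on the statements, and the expected output on its own line. A small follow-up sketch, not from the commit, showing the round trip with `decode()` under the same checkpoint:

```python
# Follow-up sketch (not part of the commit): round-tripping the example above.
from transformers import LlamaTokenizerFast

tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
ids = tokenizer.encode("Hello this is a test")  # e.g. [1, 15043, 445, 338, 263, 1243]
text = tokenizer.decode(ids, skip_special_tokens=True)
print(text)  # expected to read back as roughly "Hello this is a test"
```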
@@ -78,26 +78,28 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
     refer to this superclass for more information regarding those methods.

     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        tokenizer_file (`str`):
+        tokenizer_file (`str`, *optional*):
             [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
             contains everything needed to load the tokenizer.
-        clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
-            Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
-            spaces.
-        bos_token (`str`, *optional*, defaults to `"<s>"`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-        eos_token (`str`, *optional*, defaults to `"</s>"`):
-            The end of sequence token.
-        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add an `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+            Whether or not the default system prompt for Llama should be used.
     """

     vocab_files_names = VOCAB_FILES_NAMES
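A hypothetical sketch (not part of the commit) exercising the `LlamaTokenizerFast` arguments documented above; the checkpoint name is again the one used in the example earlier in this diff:

```python
from transformers import LlamaTokenizerFast

tokenizer = LlamaTokenizerFast.from_pretrained(
    "hf-internal-testing/llama-tokenizer",
    add_bos_token=True,  # documented default: True
    add_eos_token=True,  # override the documented default of False
)

ids = tokenizer.encode("Hello this is a test")
# With add_eos_token=True the encoded sequence should now end with the eos token id.
print(ids[0] == tokenizer.bos_token_id, ids[-1] == tokenizer.eos_token_id)
```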
@@ -361,8 +361,6 @@ OBJECTS_TO_IGNORE = [
     "LevitConfig",
     "LiltConfig",
     "LiltModel",
-    "LlamaTokenizer",
-    "LlamaTokenizerFast",
     "LongT5Config",
     "LongformerConfig",
     "LongformerModel",
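Removing `LlamaTokenizer` and `LlamaTokenizerFast` from `OBJECTS_TO_IGNORE` means their docstrings are now checked for consistency with their signatures. As a rough illustration of that kind of check (not the repository's actual checking utility):

```python
# Rough illustration only -- not the repository's actual docstring checker.
import inspect
import re


def undocumented_init_args(cls):
    """Return __init__ parameters that never appear in the class docstring's Args section."""
    doc = cls.__doc__ or ""
    # Argument entries are indented by 8 spaces and look like: "        name (`type`, ...):"
    documented = set(re.findall(r"^ {8}(\w+) \(", doc, flags=re.MULTILINE))
    params = set(inspect.signature(cls.__init__).parameters) - {"self", "args", "kwargs"}
    return params - documented


# Once a class is dropped from the ignore list, a check along these lines has to come back empty:
# from transformers import LlamaTokenizerFast
# assert not undocumented_init_args(LlamaTokenizerFast)
```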
@@ -626,7 +626,6 @@ src/transformers/models/lilt/configuration_lilt.py
 src/transformers/models/llama/configuration_llama.py
 src/transformers/models/llama/convert_llama_weights_to_hf.py
 src/transformers/models/llama/modeling_llama.py
-src/transformers/models/llama/tokenization_llama_fast.py
 src/transformers/models/longformer/configuration_longformer.py
 src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py
 src/transformers/models/longt5/configuration_longt5.py