Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-03 21:00:08 +06:00
[docstring] Fix docstring for LlamaTokenizer and LlamaTokenizerFast (#26669)

* [docstring] Fix docstring for `LlamaTokenizer` and `LlamaTokenizerFast`
* [docstring] Fix docstring typo at `LlamaTokenizer` and `LlamaTokenizerFast`
This commit is contained in:
parent e58cbed51d
commit aaccf1844e
@@ -71,6 +71,43 @@ class LlamaTokenizer(PreTrainedTokenizer):
     Args:
         vocab_file (`str`):
             Path to the vocabulary file.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
+            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
+            attention mechanisms or loss computation.
+        sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add an `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+        use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+            Whether or not the default system prompt for Llama should be used.
+        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not to add spaces between special tokens.
+        legacy (`bool`, *optional*):
         legacy (`bool`, *optional*):
             Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
             and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
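The `sp_model_kwargs` argument documented above is forwarded to `SentencePieceProcessor.__init__()`. A minimal sketch of how those subword-regularization options might be passed through (not part of this commit; the checkpoint name is reused from the diff's example and the sampling values are arbitrary):

```python
# Illustration only: forwarding SentencePiece options via `sp_model_kwargs`.
# The checkpoint is the one used in the docstring example elsewhere in this diff;
# the sampling values are arbitrary.
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained(
    "hf-internal-testing/llama-tokenizer",
    sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
)

# With sampling enabled, repeated calls can segment the same string differently
# (subword regularization).
print(tokenizer.tokenize("Hello this is a test"))
print(tokenizer.tokenize("Hello this is a test"))
```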
@@ -60,12 +60,12 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
 
     This uses notably ByteFallback and no normalization.
 
-    ```
-    from transformers import LlamaTokenizerFast
+    ```python
+    >>> from transformers import LlamaTokenizerFast
 
-    tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
-    tokenizer.encode("Hello this is a test")
-    >>> [1, 15043, 445, 338, 263, 1243]
+    >>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
+    >>> tokenizer.encode("Hello this is a test")
+    [1, 15043, 445, 338, 263, 1243]
     ```
 
     If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
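The hunk above converts the usage example to doctest format: lines prefixed with `>>>` are executable statements and the unprefixed line that follows is the expected output. As a rough, generic way to verify such a snippet with the standard `doctest` module (this is not the project's documentation-test harness):

```python
# Generic doctest check of the rewritten example; not the project's own test setup.
import doctest

EXAMPLE = """
>>> from transformers import LlamaTokenizerFast
>>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
>>> tokenizer.encode("Hello this is a test")
[1, 15043, 445, 338, 263, 1243]
"""

parser = doctest.DocTestParser()
test = parser.get_doctest(EXAMPLE, globs={}, name="llama_fast_example", filename=None, lineno=0)
runner = doctest.DocTestRunner(verbose=True)
runner.run(test)
# failed == 0 means the documented ids still match what the tokenizer returns.
print(runner.summarize())
```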
@@ -78,26 +78,28 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
     refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        tokenizer_file (`str`):
+        tokenizer_file (`str`, *optional*):
             [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
             contains everything needed to load the tokenizer.
-
-        clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
-            Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
-            spaces.
-
-        bos_token (`str`, *optional*, defaults to `"<s>"`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-        eos_token (`str`, *optional*, defaults to `"</s>"`):
-            The end of sequence token.
-
-        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add an `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+            Whether or not the default system prompt for Llama should be used.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
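Among the arguments now documented above, `add_bos_token` and `add_eos_token` control whether encoded sequences are wrapped with the special tokens. A small sketch of their effect (not from the commit; the eos id of 2 is an assumption based on the standard Llama vocabulary, while the leading bos id 1 appears in the diff's own example):

```python
# Sketch (not part of the commit): effect of `add_bos_token` / `add_eos_token`.
from transformers import LlamaTokenizerFast

tok = LlamaTokenizerFast.from_pretrained(
    "hf-internal-testing/llama-tokenizer",
    add_bos_token=True,  # documented default
    add_eos_token=True,  # documented default is False; enabled here to show the wrapping
)

ids = tok.encode("Hello this is a test")
# Expected shape of the result: the ids from the diff's example, starting with the
# bos id (1) and, because add_eos_token=True, ending with the eos id (assumed to be 2).
print(ids)
```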
@@ -361,8 +361,6 @@ OBJECTS_TO_IGNORE = [
     "LevitConfig",
     "LiltConfig",
     "LiltModel",
-    "LlamaTokenizer",
-    "LlamaTokenizerFast",
     "LongT5Config",
     "LongformerConfig",
     "LongformerModel",
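Dropping `LlamaTokenizer` and `LlamaTokenizerFast` from `OBJECTS_TO_IGNORE` means the docstring-consistency check no longer skips them, so the docstrings fixed above are now enforced. A generic sketch of how such an ignore list typically gates a check (this is not the repository's actual implementation):

```python
# Generic sketch, not the repository's implementation: entries on the ignore list are
# exempt from the docstring check, so deleting an entry opts that object back in.
OBJECTS_TO_IGNORE = ["LevitConfig", "LiltConfig", "LiltModel"]  # trimmed, illustrative


def docstring_problems(obj) -> list[str]:
    """Placeholder check: flag objects whose docstring lacks an `Args:` section."""
    doc = obj.__doc__ or ""
    return [] if "Args:" in doc else ["missing `Args:` section"]


def run_check(objects) -> dict[str, list[str]]:
    failures = {}
    for obj in objects:
        if obj.__name__ in OBJECTS_TO_IGNORE:
            continue  # still temporarily exempt
        problems = docstring_problems(obj)
        if problems:
            failures[obj.__name__] = problems
    return failures
```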
@@ -626,7 +626,6 @@ src/transformers/models/lilt/configuration_lilt.py
 src/transformers/models/llama/configuration_llama.py
 src/transformers/models/llama/convert_llama_weights_to_hf.py
 src/transformers/models/llama/modeling_llama.py
-src/transformers/models/llama/tokenization_llama_fast.py
 src/transformers/models/longformer/configuration_longformer.py
 src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py
 src/transformers/models/longt5/configuration_longt5.py