[docstring] Fix docstring for LlamaTokenizer and LlamaTokenizerFast (#26669)

* [docstring] Fix docstring for `LlamaTokenizer` and `LlamaTokenizerFast`

* [docstring] Fix docstring typo at `LlamaTokenizer` and `LlamaTokenizerFast`
This commit is contained in:
Minho Ryang 2023-10-12 00:03:31 +09:00 committed by GitHub
parent e58cbed51d
commit aaccf1844e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 58 additions and 22 deletions

View File

@ -71,6 +71,43 @@ class LlamaTokenizer(PreTrainedTokenizer):
Args:
vocab_file (`str`):
Path to the vocabulary file.
unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
The end of sequence token.
pad_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
attention mechanisms or loss computation.
sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
add_bos_token (`bool`, *optional*, defaults to `True`):
Whether or not to add an `bos_token` at the start of sequences.
add_eos_token (`bool`, *optional*, defaults to `False`):
Whether or not to add an `eos_token` at the end of sequences.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
use_default_system_prompt (`bool`, *optional*, defaults to `True`):
Whether or not the default system prompt for Llama should be used.
spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to add spaces between special tokens.
legacy (`bool`, *optional*):
Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple

View File

@ -60,12 +60,12 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
This uses notably ByteFallback and no normalization.
```
from transformers import LlamaTokenizerFast
```python
>>> from transformers import LlamaTokenizerFast
tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
tokenizer.encode("Hello this is a test")
>>> [1, 15043, 445, 338, 263, 1243]
>>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
>>> tokenizer.encode("Hello this is a test")
[1, 15043, 445, 338, 263, 1243]
```
If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
@ -78,26 +78,28 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
vocab_file (`str`, *optional*):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
contains the vocabulary necessary to instantiate a tokenizer.
tokenizer_file (`str`):
tokenizer_file (`str`, *optional*):
[tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
spaces.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
The end of sequence token.
add_bos_token (`bool`, *optional*, defaults to `True`):
Whether or not to add an `bos_token` at the start of sequences.
add_eos_token (`bool`, *optional*, defaults to `False`):
Whether or not to add an `eos_token` at the end of sequences.
use_default_system_prompt (`bool`, *optional*, defaults to `True`):
Whether or not the default system prompt for Llama should be used.
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@ -361,8 +361,6 @@ OBJECTS_TO_IGNORE = [
"LevitConfig",
"LiltConfig",
"LiltModel",
"LlamaTokenizer",
"LlamaTokenizerFast",
"LongT5Config",
"LongformerConfig",
"LongformerModel",

View File

@ -626,7 +626,6 @@ src/transformers/models/lilt/configuration_lilt.py
src/transformers/models/llama/configuration_llama.py
src/transformers/models/llama/convert_llama_weights_to_hf.py
src/transformers/models/llama/modeling_llama.py
src/transformers/models/llama/tokenization_llama_fast.py
src/transformers/models/longformer/configuration_longformer.py
src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py
src/transformers/models/longt5/configuration_longt5.py