diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py
index 907ddd65bbe..dcf1d8660f0 100644
--- a/src/transformers/models/llama/tokenization_llama.py
+++ b/src/transformers/models/llama/tokenization_llama.py
@@ -71,6 +71,43 @@ class LlamaTokenizer(PreTrainedTokenizer):
     Args:
         vocab_file (`str`):
             Path to the vocabulary file.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
+            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by
+            attention mechanisms or loss computation.
+        sp_model_kwargs (`Dict[str, Any]`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite, samples from all hypotheses (lattice)
+                using the forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add a `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
+            extra spaces.
+        use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+            Whether or not the default system prompt for Llama should be used.
+        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not to add spaces between special tokens.
         legacy (`bool`, *optional*):
             Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
             and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
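A quick illustration of the `sp_model_kwargs` passthrough documented above — a sketch, not part of the patch; it assumes the same `hf-internal-testing/llama-tokenizer` test checkpoint used in the doctest further down:

```python
>>> from transformers import LlamaTokenizer

>>> # The dict is forwarded verbatim to SentencePieceProcessor.__init__(), which is what
>>> # enables subword regularization: with sampling on, repeated encodes of the same text
>>> # may return different segmentations, so the resulting ids can vary from run to run.
>>> tokenizer = LlamaTokenizer.from_pretrained(
...     "hf-internal-testing/llama-tokenizer",
...     sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
... )
>>> ids = tokenizer.encode("Hello this is a test")  # sampled segmentation
```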
diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py
index 6e9cd2aa3ba..229272e0045 100644
--- a/src/transformers/models/llama/tokenization_llama_fast.py
+++ b/src/transformers/models/llama/tokenization_llama_fast.py
@@ -60,12 +60,12 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
 
     This uses notably ByteFallback and no normalization.
 
-    ```
-    from transformers import LlamaTokenizerFast
+    ```python
+    >>> from transformers import LlamaTokenizerFast
 
-    tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
-    tokenizer.encode("Hello this is a test")
-    >>> [1, 15043, 445, 338, 263, 1243]
+    >>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
+    >>> tokenizer.encode("Hello this is a test")
+    [1, 15043, 445, 338, 263, 1243]
     ```
 
     If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
@@ -78,26 +78,28 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
     refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        tokenizer_file (`str`):
+        tokenizer_file (`str`, *optional*):
             [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
             contains everything needed to load the tokenizer.
-
-        clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
-            Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
-            spaces.
-
-        bos_token (`str`, *optional*, defaults to `"<s>"`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-        eos_token (`str`, *optional*, defaults to `"</s>"`):
-            The end of sequence token.
-
-        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
+            extra spaces.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add a `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+            Whether or not the default system prompt for Llama should be used.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
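To make the new `add_bos_token`/`add_eos_token` flags concrete, a hedged sketch (also not part of the patch; the expected output assumes bos/eos ids 1 and 2 for this test checkpoint, consistent with the doctest above):

```python
>>> from transformers import LlamaTokenizerFast

>>> # add_bos_token defaults to True; opting into add_eos_token appends the eos id as well.
>>> tokenizer = LlamaTokenizerFast.from_pretrained(
...     "hf-internal-testing/llama-tokenizer", add_bos_token=True, add_eos_token=True
... )
>>> tokenizer.encode("Hello this is a test")
[1, 15043, 445, 338, 263, 1243, 2]
```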
""" vocab_files_names = VOCAB_FILES_NAMES diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index f142c5dbccd..c86d0ae8e64 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -361,8 +361,6 @@ OBJECTS_TO_IGNORE = [ "LevitConfig", "LiltConfig", "LiltModel", - "LlamaTokenizer", - "LlamaTokenizerFast", "LongT5Config", "LongformerConfig", "LongformerModel", diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt index 617385cabb9..4ff70b0cbe7 100644 --- a/utils/not_doctested.txt +++ b/utils/not_doctested.txt @@ -626,7 +626,6 @@ src/transformers/models/lilt/configuration_lilt.py src/transformers/models/llama/configuration_llama.py src/transformers/models/llama/convert_llama_weights_to_hf.py src/transformers/models/llama/modeling_llama.py -src/transformers/models/llama/tokenization_llama_fast.py src/transformers/models/longformer/configuration_longformer.py src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py src/transformers/models/longt5/configuration_longt5.py