mirror of
https://github.com/huggingface/transformers.git
synced 2025-08-01 02:31:11 +06:00
Updated documentation and added conversion utility (#34319)
* Updated documentation and added conversion utility * Update docs/source/en/tiktoken.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/tiktoken.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Moved util function to integration folder + allow for str * Update formatting Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Updated formatting * style changes --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
parent
890ea7de93
commit
95c10fedb3
@ -36,3 +36,25 @@ from transformers import AutoTokenizer
|
|||||||
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
|
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original")
|
tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original")
|
||||||
```
|
```
|
||||||
|
## Create tiktoken tokenizer

The `tokenizer.model` file contains no information about additional tokens or pattern strings. If these are important, convert the tokenizer to `tokenizer.json`, the appropriate format for [`PreTrainedTokenizerFast`].

Generate the `tokenizer.model` file with [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) and then convert it to `tokenizer.json` with [`convert_tiktoken_to_fast`].

```py
from transformers.integrations.tiktoken import convert_tiktoken_to_fast
from tiktoken import get_encoding

# You can load your custom encoding or the one provided by OpenAI
encoding = get_encoding("gpt2")
convert_tiktoken_to_fast(encoding, "config/save/dir")
```

The resulting `tokenizer.json` file is saved to the specified directory and can be loaded with [`PreTrainedTokenizerFast`].

```py
tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir")
```
|
45
src/transformers/integrations/tiktoken.py
Normal file
45
src/transformers/integrations/tiktoken.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from transformers.convert_slow_tokenizer import TikTokenConverter
|
||||||
|
from transformers.tokenization_utils_fast import TIKTOKEN_VOCAB_FILE, TOKENIZER_FILE
|
||||||
|
|
||||||
|
|
||||||
|
def convert_tiktoken_to_fast(encoding: Any, output_dir: str):
    """
    Converts a given `tiktoken` encoding to `PreTrainedTokenizerFast` and saves the configuration of the converted
    tokenizer on disk.

    Args:
        encoding (`str` or `tiktoken.Encoding`):
            Tokenizer from the `tiktoken` library. If `encoding` is a `str`, the tokenizer will be loaded with
            `tiktoken.get_encoding(encoding)`.
        output_dir (`str`):
            Save path for the converted tokenizer configuration file.

    Raises:
        ValueError: If the `tiktoken` library is not installed.
    """
    output_dir = Path(output_dir)
    # parents=True so a nested, not-yet-existing save path doesn't raise FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    # The raw tiktoken vocab goes into a "tiktoken" subfolder; the converted
    # fast-tokenizer file goes directly into the output directory.
    save_file = output_dir / "tiktoken" / TIKTOKEN_VOCAB_FILE
    tokenizer_file = output_dir / TOKENIZER_FILE

    save_file_absolute = str(save_file.absolute())
    output_file_absolute = str(tokenizer_file.absolute())

    try:
        from tiktoken import get_encoding
        from tiktoken.load import dump_tiktoken_bpe

        if isinstance(encoding, str):
            encoding = get_encoding(encoding)

        # Make sure the "tiktoken" subdirectory exists before writing the vocab
        # file — dump_tiktoken_bpe writes to a path and won't create it for us.
        save_file.parent.mkdir(parents=True, exist_ok=True)
        dump_tiktoken_bpe(encoding._mergeable_ranks, save_file_absolute)
    except ImportError as err:
        # Keep raising ValueError for backward compatibility with existing
        # callers, but chain the original ImportError for debuggability.
        raise ValueError(
            "`tiktoken` is required to save a `tiktoken` file. Install it with " "`pip install tiktoken`."
        ) from err

    tokenizer = TikTokenConverter(
        vocab_file=save_file_absolute, pattern=encoding._pat_str, additional_special_tokens=encoding._special_tokens
    ).tokenizer()
    tokenizer.save(output_file_absolute)
Loading…
Reference in New Issue
Block a user