Fix Llama 3 TikToken conversion (#33538)

* Fix Llama 3 TikToken conversion

* No need to add tokens again
This commit is contained in:
Pedro Cuenca 2024-09-20 01:28:33 +02:00 committed by GitHub
parent 4d8908df27
commit 0c718f16d1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -332,7 +332,7 @@ def write_model(
class Llama3Converter(TikTokenConverter):
def __init__(self, vocab_file, special_tokens=None, instruct=False, model_max_length=None, **kwargs):
-super().__init__(vocab_file, **kwargs)
+super().__init__(vocab_file, additional_special_tokens=special_tokens, **kwargs)
tokenizer = self.converted()
chat_template = (
"{% set loop_messages = messages %}"
@@ -345,7 +345,6 @@ class Llama3Converter(TikTokenConverter):
"{% endfor %}"
"{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
)
-tokenizer.add_special_tokens(special_tokens)
self.tokenizer = PreTrainedTokenizerFast(
tokenizer_object=tokenizer,