mirror of
https://github.com/huggingface/transformers.git
synced 2025-08-03 11:41:51 +06:00
gguf conversion add_prefix_space=None for llama3 (#31937)
* GGUF conversion forces add_prefix_space=False for llama3; this is not required and forces from_slow, which fails. Changed the value to None and added a test. * Fixed a typo. * Cleaned up the test.
This commit is contained in:
parent
2e113422b3
commit
a1844a3209
@ -609,7 +609,7 @@ class GGUFLlamaConverter(LlamaConverter):
|
|||||||
self.additional_kwargs["bos_token"] = eos_token
|
self.additional_kwargs["bos_token"] = eos_token
|
||||||
|
|
||||||
if self.is_llama_3_tokenizer:
|
if self.is_llama_3_tokenizer:
|
||||||
self.additional_kwargs["add_prefix_space"] = False
|
self.additional_kwargs["add_prefix_space"] = None
|
||||||
self.additional_kwargs["clean_up_tokenization_spaces"] = True
|
self.additional_kwargs["clean_up_tokenization_spaces"] = True
|
||||||
|
|
||||||
self.additional_kwargs["legacy"] = False
|
self.additional_kwargs["legacy"] = False
|
||||||
|
@ -174,10 +174,13 @@ class GgufIntegrationTests(unittest.TestCase):
|
|||||||
self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
|
self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
|
||||||
|
|
||||||
def test_llama3_q4_0_tokenizer(self):
|
def test_llama3_q4_0_tokenizer(self):
|
||||||
tokenizer_gguf = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
|
tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
|
||||||
special_sentence = "สวัสดี"
|
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||||
predicted_text = tokenizer_gguf.decode(tokenizer_gguf.encode(special_sentence, return_tensors="pt")[0])
|
tokenizer.save_pretrained(tmpdirname)
|
||||||
self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence)
|
tokenizer = AutoTokenizer.from_pretrained(tmpdirname)
|
||||||
|
special_sentence = "สวัสดี"
|
||||||
|
predicted_text = tokenizer.decode(tokenizer.encode(special_sentence, return_tensors="pt")[0])
|
||||||
|
self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence)
|
||||||
|
|
||||||
def test_llama3_q4_0(self):
|
def test_llama3_q4_0(self):
|
||||||
tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
|
tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
|
||||||
|
Loading…
Reference in New Issue
Block a user