https://github.com/huggingface/transformers.git
commit 0013ba61e5
parent c7eb95581a
@@ -94,6 +94,7 @@ class GPTQTest(unittest.TestCase):
     EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University")
     EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N")
     EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the")
+    EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a new member of the")
 
     # this seems a little small considering that we are doing 4bit quant but we have a small model and we don't quantize the embeddings
     EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
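The comment above reasons that 4-bit quantization of a small model gives only a modest memory ratio because the embeddings are left unquantized and dominate the parameter count. A minimal sketch of that arithmetic, with toy sizes that are assumptions and not the model used in GPTQTest:

```python
# Illustrative arithmetic only: hidden_size, vocab_size and n_layers are made-up
# toy values, not taken from the test.
hidden_size, vocab_size, n_layers = 768, 50257, 12

embedding_params = vocab_size * hidden_size      # embeddings stay in fp16 (not quantized)
linear_params = 12 * n_layers * hidden_size**2   # rough count of quantizable linear weights

fp16_bytes = 2 * (embedding_params + linear_params)        # 2 bytes per fp16 weight
quant_bytes = 2 * embedding_params + 0.5 * linear_params   # ~0.5 bytes per 4-bit weight

print(fp16_bytes / quant_bytes)  # well under 4x: the unquantized embeddings cap the saving
```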
@@ -260,7 +261,9 @@ class GPTQTest(unittest.TestCase):
             if self.device_map == "cpu":
                 quant_type = "ipex" if is_ipex_available() else "torch"
             else:
-                quant_type = "exllama"
+                # We expect tritonv2 to be used here, because the exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
+                # TODO: Remove this once GPTQModel exllama kernels support packing
+                quant_type = "tritonv2"
             quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
                 tmpdirname, device_map=self.device_map
             )
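For context, this hunk sits in a serialization test: a GPTQ-quantized model is saved and reloaded, and the reloaded model's kernel backend is checked against quant_type. A hedged, self-contained sketch of that round trip (the model name and calibration dataset are illustrative assumptions, not values from the test):

```python
# Sketch of the save/reload flow exercised by the test above; requires optimum and a
# GPTQ backend (gptqmodel or auto-gptq) to be installed. Names below are assumptions.
import tempfile

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_name = "facebook/opt-125m"  # assumed small model, not the one used in GPTQTest
tokenizer = AutoTokenizer.from_pretrained(model_name)
quantization_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)

quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", quantization_config=quantization_config
)

with tempfile.TemporaryDirectory() as tmpdirname:
    quantized_model.save_pretrained(tmpdirname)
    # On reload, the kernel backend is picked for the target device:
    # "ipex"/"torch" on CPU, "tritonv2" otherwise, per the change above.
    reloaded = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
```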
@@ -424,9 +427,17 @@ class GPTQTestExllamaV2(unittest.TestCase):
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
 
     def test_quantized_layers_type(self):
-        self.assertEqual(
-            self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
-            "exllama" if is_gptqmodel_available() else "exllamav2",
-        )
+        if is_auto_gptq_available() and not is_gptqmodel_available():
+            self.assertEqual(
+                self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+                "exllamav2",
+            )
+        else:
+            # We expect tritonv2 to be used here, because the exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
+            # TODO: Remove this once GPTQModel exllama kernels support packing
+            self.assertEqual(
+                self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+                "tritonv2",
+            )
 
     def check_inference_correctness(self, model):
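The assertions above read the QUANT_TYPE attribute of a single quantized linear layer. A small helper of the kind sketched below (hypothetical, not part of the test file) makes the same check across the whole model:

```python
def collect_quant_types(model):
    """Map module names to the kernel backend reported by their QUANT_TYPE attribute.

    Hypothetical inspection helper; only quantized linear modules produced by a GPTQ
    backend (gptqmodel / auto-gptq) expose QUANT_TYPE, so other modules are skipped.
    """
    quant_types = {}
    for name, module in model.named_modules():
        quant_type = getattr(module, "QUANT_TYPE", None)
        if quant_type is not None:
            quant_types[name] = quant_type
    return quant_types


# Example: assert every quantized layer uses the expected backend
# ("exllamav2" with auto-gptq, "tritonv2" with GPTQModel, per the test above).
# assert set(collect_quant_types(quantized_model).values()) == {"tritonv2"}
```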