Mirror of https://github.com/huggingface/transformers.git
parent c7eb95581a
commit 0013ba61e5
@@ -94,6 +94,7 @@ class GPTQTest(unittest.TestCase):
     EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University")
     EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N")
     EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the")
+    EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a new member of the")

     # this seems a little small considering that we are doing 4bit quant but we have a small model and we don't quantize the embeddings
     EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
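For context, EXPECTED_RELATIVE_DIFFERENCE is the ratio between the fp16 and 4-bit memory footprints that the test class asserts elsewhere; the comment explains why it is well below 4x. A minimal sketch of how such a check can be reproduced, assuming a small causal LM quantized on the fly (the model id and GPTQConfig arguments are illustrative, not the checkpoint pinned by this test):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Sketch only: model id and quantization settings are assumptions, not taken from this diff.
model_id = "bigscience/bloom-560m"

fp16_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
mem_fp16 = fp16_model.get_memory_footprint()

tokenizer = AutoTokenizer.from_pretrained(model_id)
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer),
)
mem_quantized = quantized_model.get_memory_footprint()

# The ratio stays well below the naive 4x because the model is small and the
# embedding weights are left unquantized.
print(f"relative difference: {mem_fp16 / mem_quantized:.3f}")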
@@ -260,7 +261,9 @@ class GPTQTest(unittest.TestCase):
             if self.device_map == "cpu":
                 quant_type = "ipex" if is_ipex_available() else "torch"
             else:
-                quant_type = "exllama"
+                # We expect tritonv2 to be used here, because the exllama backend doesn't support packing: https://github.com/ModelCloud/GPTQModel/issues/1354
+                # TODO: Remove this once GPTQModel exllama kernels support packing
+                quant_type = "tritonv2"
             quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
                 tmpdirname, device_map=self.device_map
             )
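The branch above encodes which kernel the reloaded checkpoint is expected to use: ipex or torch on CPU, and tritonv2 instead of exllama on GPU until GPTQModel's exllama kernels can pack. A minimal sketch of how that expectation might be verified against a reloaded model, assuming a placeholder checkpoint path and the usual import location of the availability helper:

from transformers import AutoModelForCausalLM
from transformers.utils import is_ipex_available  # assumed import location

device_map = "cpu"  # or a CUDA device map, e.g. {"": 0}

# Placeholder path: in the test this is the temporary directory the quantized
# model was just saved to.
reloaded = AutoModelForCausalLM.from_pretrained(
    "/path/to/saved/gptq/checkpoint", device_map=device_map
)

if device_map == "cpu":
    expected_quant_type = "ipex" if is_ipex_available() else "torch"
else:
    # exllama kernels cannot pack yet (GPTQModel issue #1354), so tritonv2 is used
    expected_quant_type = "tritonv2"

# GPTQModel's quantized linear layers expose the selected kernel via QUANT_TYPE.
assert reloaded.model.layers[0].self_attn.k_proj.QUANT_TYPE == expected_quant_type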
@@ -424,10 +427,18 @@ class GPTQTestExllamaV2(unittest.TestCase):
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)

     def test_quantized_layers_type(self):
-        self.assertEqual(
-            self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
-            "exllama" if is_gptqmodel_available() else "exllamav2",
-        )
+        if is_auto_gptq_available() and not is_gptqmodel_available():
+            self.assertEqual(
+                self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+                "exllamav2",
+            )
+        else:
+            # We expect tritonv2 to be used here, because the exllama backend doesn't support packing: https://github.com/ModelCloud/GPTQModel/issues/1354
+            # TODO: Remove this once GPTQModel exllama kernels support packing
+            self.assertEqual(
+                self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+                "tritonv2",
+            )

     def check_inference_correctness(self, model):
         """
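The hunk ends at the top of check_inference_correctness, whose body is not shown in this diff. As a rough sketch of what such a helper usually does in these tests, assuming a prompt attribute like self.input_text and greedy decoding (both assumptions, not taken from this commit):

def check_inference_correctness(self, model):
    # Sketch only: generate a short continuation and check it against the
    # EXPECTED_OUTPUTS set extended in the first hunk.
    encoded = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
    output = model.generate(**encoded, max_new_tokens=10, do_sample=False)
    decoded = self.tokenizer.decode(output[0], skip_special_tokens=True)
    self.assertIn(decoded, self.EXPECTED_OUTPUTS)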