From 0013ba61e5e90c30835aaa946ce9e6c3e00a9070 Mon Sep 17 00:00:00 2001
From: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
Date: Wed, 12 Mar 2025 20:03:02 +0100
Subject: [PATCH] Fix Failing GPTQ tests (#36666)

fix tests
---
 tests/quantization/gptq/test_gptq.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py
index c0056b23866..7d8d3501ce0 100644
--- a/tests/quantization/gptq/test_gptq.py
+++ b/tests/quantization/gptq/test_gptq.py
@@ -94,6 +94,7 @@ class GPTQTest(unittest.TestCase):
     EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University")
     EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N")
     EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the")
+    EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a new member of the")

     # this seems a little small considering that we are doing 4bit quant but we have a small model and we don't quantize the embeddings
     EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
@@ -260,7 +261,9 @@ class GPTQTest(unittest.TestCase):
             if self.device_map == "cpu":
                 quant_type = "ipex" if is_ipex_available() else "torch"
             else:
-                quant_type = "exllama"
+                # We expect tritonv2 to be used here, because the exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
+                # TODO: Remove this once the GPTQModel exllama kernels support packing
+                quant_type = "tritonv2"
             quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
                 tmpdirname, device_map=self.device_map
             )
@@ -424,10 +427,18 @@ class GPTQTestExllamaV2(unittest.TestCase):
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)

     def test_quantized_layers_type(self):
-        self.assertEqual(
-            self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
-            "exllama" if is_gptqmodel_available() else "exllamav2",
-        )
+        if is_auto_gptq_available() and not is_gptqmodel_available():
+            self.assertEqual(
+                self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+                "exllamav2",
+            )
+        else:
+            # We expect tritonv2 to be used here, because the exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
+            # TODO: Remove this once the GPTQModel exllama kernels support packing
+            self.assertEqual(
+                self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+                "tritonv2",
+            )

     def check_inference_correctness(self, model):
         """
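
Note (not part of the patch itself): the assertions changed above encode which GPTQ kernel backend is expected on a quantized layer after reloading a saved 4-bit checkpoint. Below is a minimal sketch of that expectation, assuming the is_ipex_available helper is importable from transformers.utils as in the test module; the expected_quant_type helper and the checkpoint path are illustrative placeholders, not transformers APIs.

from transformers import AutoModelForCausalLM
from transformers.utils import is_ipex_available


def expected_quant_type(device_map: str) -> str:
    # Mirrors what the patched serialization test asserts: CPU runs use the ipex
    # or torch kernels, while GPU runs are expected to use tritonv2, because the
    # GPTQModel exllama kernels do not support packing
    # (https://github.com/ModelCloud/GPTQModel/issues/1354).
    if device_map == "cpu":
        return "ipex" if is_ipex_available() else "torch"
    return "tritonv2"


# "path/to/saved-gptq-model" is a placeholder for a directory produced by
# save_pretrained() on a GPTQ-quantized model, as in the serialization test above.
model = AutoModelForCausalLM.from_pretrained("path/to/saved-gptq-model", device_map="cuda:0")
print(model.model.layers[0].self_attn.k_proj.QUANT_TYPE, expected_quant_type("cuda:0"))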