Fix Failing GPTQ tests (#36666)

fix tests
This commit is contained in:
Mohamed Mekkouri 2025-03-12 20:03:02 +01:00 committed by GitHub
parent c7eb95581a
commit 0013ba61e5

@@ -94,6 +94,7 @@ class GPTQTest(unittest.TestCase):
EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University")
EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N")
EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the")
EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a new member of the")
# this seems a little small considering that we are doing 4bit quant but we have a small model and we don't quantize the embeddings
EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
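For context, `EXPECTED_RELATIVE_DIFFERENCE` is the fp16-to-quantized memory ratio the suite asserts. A minimal sketch of what such a check typically looks like, assuming the fp16 footprint is cached in `self.mem_fp16` (an assumed attribute name; `get_memory_footprint()` is the real `PreTrainedModel` method):

```python
# Hedged sketch of the footprint check; `mem_fp16` is an assumed attribute,
# get_memory_footprint() is the real PreTrainedModel API.
def test_memory_footprint(self):
    mem_quantized = self.quantized_model.get_memory_footprint()
    # ~1.66x is modest for 4-bit quant, since the model is small and the
    # embeddings are left unquantized (see the comment above)
    self.assertAlmostEqual(self.mem_fp16 / mem_quantized, self.EXPECTED_RELATIVE_DIFFERENCE)
```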
@@ -260,7 +261,9 @@ class GPTQTest(unittest.TestCase):
if self.device_map == "cpu":
quant_type = "ipex" if is_ipex_available() else "torch"
else:
quant_type = "exllama"
# We expect tritonv2 to be used here, because the exllama backend doesn't support packing: https://github.com/ModelCloud/GPTQModel/issues/1354
# TODO: Remove this once the GPTQModel exllama kernels support packing
quant_type = "tritonv2"
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
tmpdirname, device_map=self.device_map
)
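The reload path therefore picks its kernel backend by device. A condensed sketch of that selection logic, assuming only what the hunk above shows (`is_ipex_available` is the real transformers utility; the GPU fallback mirrors the comment in the diff):

```python
from transformers.utils import is_ipex_available

def expected_quant_type(device_map: str) -> str:
    """Sketch of the backend the test expects after save/reload."""
    if device_map == "cpu":
        # CPU inference goes through Intel Extension for PyTorch when
        # available, otherwise the plain torch kernels
        return "ipex" if is_ipex_available() else "torch"
    # On GPU, GPTQModel falls back to tritonv2 because the exllama
    # kernels cannot repack weights (GPTQModel#1354)
    return "tritonv2"
```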
@@ -424,9 +427,17 @@ class GPTQTestExllamaV2(unittest.TestCase):
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
def test_quantized_layers_type(self):
if is_auto_gptq_available() and not is_gptqmodel_available():
self.assertEqual(
self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
"exllama" if is_gptqmodel_available() else "exllamav2",
"exllamav2",
)
else:
# We expect tritonv2 to be used here, because the exllama backend doesn't support packing: https://github.com/ModelCloud/GPTQModel/issues/1354
# TODO: Remove this once the GPTQModel exllama kernels support packing
self.assertEqual(
self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
"tritonv2",
)
def check_inference_correctness(self, model):
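For anyone reproducing the assertion outside the suite, a hedged sketch of inspecting which kernel a quantized layer received; the checkpoint name is a placeholder, and `QUANT_TYPE` is the attribute checked in the test above:

```python
from transformers import AutoModelForCausalLM

# placeholder checkpoint: any GPTQ-quantized Llama-style causal LM
model = AutoModelForCausalLM.from_pretrained("some-org/some-gptq-model", device_map="auto")
# auto-gptq layers report "exllamav2" here; GPTQModel reports "tritonv2"
print(model.model.layers[0].self_attn.k_proj.QUANT_TYPE)
```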