Mirror of https://github.com/huggingface/transformers.git
fix hqq due to recent modeling changes (#36771)

* fix-hqq
* style
* test

Commit 3017536ebf (parent e959530b8f)
@@ -169,7 +169,12 @@ class HqqHfQuantizer(HfQuantizer):
                 and tensor_name != "bias"
             )
         else:
-            return isinstance(module, torch.nn.Linear) and tensor_name == "weight"
+            # we need a special path for bias since hqq overwrote load_state_dict for this layer
+            return (
+                isinstance(module, torch.nn.Linear)
+                and tensor_name == "weight"
+                or (isinstance(module, HQQLinear) and tensor_name == "bias")
+            )

     def create_quantized_param(
         self,
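A note on the first hunk: Python's `and` binds tighter than `or`, so the new expression evaluates as `(isinstance(module, torch.nn.Linear) and tensor_name == "weight") or (isinstance(module, HQQLinear) and tensor_name == "bias")`. A standalone sketch of the predicate, assuming the hqq package's usual import path for HQQLinear (the function name here is illustrative, not the quantizer's real method):

import torch
from hqq.core.quantize import HQQLinear

def needs_quantization_path(module, tensor_name):
    # Weights of still-unquantized nn.Linear layers go through the
    # quantization path; so does the bias of an already-swapped HQQLinear,
    # because hqq overrides load_state_dict for that layer.
    return (
        isinstance(module, torch.nn.Linear)
        and tensor_name == "weight"
        or (isinstance(module, HQQLinear) and tensor_name == "bias")
    )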
@@ -194,6 +199,10 @@ class HqqHfQuantizer(HfQuantizer):
         parent_module = find_parent(model, layer_name)
         node = layer_name.split(".")[-1]

+        if tensor_name == "bias":
+            # this should already be set
+            return
+
         # set module state_dict
         module_state_dict = {}
         for k, v in state_dict.items():
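The second hunk short-circuits create_quantized_param for bias tensors: by the time it runs, HQQLinear's overridden load_state_dict has already set the bias, so re-processing it would be redundant. For context, the find_parent/node pair above resolves where the quantized layer is re-attached; a self-contained illustration of that lookup (this find_parent is a simplified re-implementation for the example, not the exact transformers helper):

import torch

def find_parent(model, name):
    # Walk the dotted module path and return the parent of the final node,
    # e.g. "decoder.layers.0.self_attn.v_proj" -> the self_attn module.
    module = model
    for part in name.split(".")[:-1]:
        module = getattr(module, part)
    return module

model = torch.nn.Sequential(torch.nn.Linear(4, 4))
layer_name = "0"
parent_module = find_parent(model, layer_name)   # the Sequential itself
node = layer_name.split(".")[-1]                 # attribute name to setattr on
assert parent_module is model and node == "0"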
@@ -145,6 +145,28 @@ class HQQTestMultiGPU(unittest.TestCase):
         check_forward(self, hqq_runner.model)


+@slow
+@require_torch_gpu
+@require_accelerate
+@require_hqq
+class HQQTestBias(unittest.TestCase):
+    def tearDown(self):
+        cleanup()
+
+    def test_fp16_quantized_model(self):
+        """
+        Simple LLM model testing fp16 with bias
+        """
+        quant_config = HqqConfig(nbits=8, group_size=64)
+
+        hqq_runner = HQQLLMRunner(
+            model_id="facebook/opt-125m", quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
+        )
+
+        check_hqqlayer(self, hqq_runner.model.model.decoder.layers[0].self_attn.v_proj)
+        check_forward(self, hqq_runner.model)
+
+
 @slow
 @require_torch_gpu
 @require_accelerate
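The new HQQTestBias case targets facebook/opt-125m because its linear layers carry biases, exercising the new bias path end to end. Outside the test harness, the equivalent flow through the public API looks roughly like this (a sketch assuming a CUDA device and the hqq package installed; the prompt text is arbitrary):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

# Same quantization settings as the new test.
quant_config = HqqConfig(nbits=8, group_size=64)

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",
    torch_dtype=torch.float16,
    device_map="cuda",
    quantization_config=quant_config,
)

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0], skip_special_tokens=True))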