fix hqq due to recent modeling changes (#36771)

* fix-hqq

* style

* test
Author: Marc Sun (committed by GitHub)
Date:   2025-03-18 12:20:27 +01:00
Parent: e959530b8f
Commit: 3017536ebf
2 changed files with 32 additions and 1 deletion


@@ -169,7 +169,12 @@ class HqqHfQuantizer(HfQuantizer):
                 and tensor_name != "bias"
             )
         else:
-            return isinstance(module, torch.nn.Linear) and tensor_name == "weight"
+            # we need a special path for bias since hqq overwrote load_state_dict for this layer
+            return (
+                isinstance(module, torch.nn.Linear)
+                and tensor_name == "weight"
+                or (isinstance(module, HQQLinear) and tensor_name == "bias")
+            )

     def create_quantized_param(
         self,
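
For context, a rough standalone sketch of the routing rule this hunk introduces: plain nn.Linear weights still go through quantization, and the bias of an already-converted HQQLinear is claimed as well, since hqq overrides load_state_dict on that layer. The function name and the import guard below are illustrative, not part of the patch.

import torch

try:
    from hqq.core.quantize import HQQLinear
except ImportError:  # hqq is an optional dependency
    HQQLinear = None

def should_handle_param(module, tensor_name):
    # hypothetical standalone version of the predicate added above
    if isinstance(module, torch.nn.Linear) and tensor_name == "weight":
        return True
    # bias of a layer that hqq already converted needs the special path too
    return HQQLinear is not None and isinstance(module, HQQLinear) and tensor_name == "bias"
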
@@ -194,6 +199,10 @@ class HqqHfQuantizer(HfQuantizer):
         parent_module = find_parent(model, layer_name)
         node = layer_name.split(".")[-1]

+        if tensor_name == "bias":
+            # this should already be set
+            return
+
         # set module state_dict
         module_state_dict = {}
         for k, v in state_dict.items():
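
As a quick end-to-end sketch of the user-facing path these two hunks fix: loading a bias-carrying model such as OPT with an HQQ config should now resolve the bias tensors without tripping the quantizer. The settings below mirror the new test; the exact loading kwargs are otherwise an assumption.

import torch
from transformers import AutoModelForCausalLM, HqqConfig

# 8-bit HQQ config, same settings as the new HQQTestBias test
quant_config = HqqConfig(nbits=8, group_size=64)

# facebook/opt-125m uses nn.Linear layers with biases, so loading it
# exercises the HQQLinear bias branch added above
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",
    quantization_config=quant_config,
    torch_dtype=torch.float16,
    device_map="cuda",
)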


@@ -145,6 +145,28 @@ class HQQTestMultiGPU(unittest.TestCase):
         check_forward(self, hqq_runner.model)


+@slow
+@require_torch_gpu
+@require_accelerate
+@require_hqq
+class HQQTestBias(unittest.TestCase):
+    def tearDown(self):
+        cleanup()
+
+    def test_fp16_quantized_model(self):
+        """
+        Simple LLM model testing fp16 with bias
+        """
+        quant_config = HqqConfig(nbits=8, group_size=64)
+
+        hqq_runner = HQQLLMRunner(
+            model_id="facebook/opt-125m", quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
+        )
+
+        check_hqqlayer(self, hqq_runner.model.model.decoder.layers[0].self_attn.v_proj)
+        check_forward(self, hqq_runner.model)
+
+
 @slow
 @require_torch_gpu
 @require_accelerate
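
To exercise the new test class locally, something along these lines should work; the test-file path is assumed from the usual layout of the HQQ tests, and RUN_SLOW is needed because the class is marked @slow:

RUN_SLOW=1 python -m pytest -v tests/quantization/hqq/test_hqq.py -k HQQTestBias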