diff --git a/src/transformers/quantizers/quantizer_hqq.py b/src/transformers/quantizers/quantizer_hqq.py
index 93ab958a30c..60d334fdd9b 100755
--- a/src/transformers/quantizers/quantizer_hqq.py
+++ b/src/transformers/quantizers/quantizer_hqq.py
@@ -169,7 +169,12 @@ class HqqHfQuantizer(HfQuantizer):
                 and tensor_name != "bias"
             )
         else:
-            return isinstance(module, torch.nn.Linear) and tensor_name == "weight"
+            # we need a special path for bias since hqq overwrote load_state_dict for this layer
+            return (
+                isinstance(module, torch.nn.Linear)
+                and tensor_name == "weight"
+                or (isinstance(module, HQQLinear) and tensor_name == "bias")
+            )
 
     def create_quantized_param(
         self,
@@ -194,6 +199,10 @@ class HqqHfQuantizer(HfQuantizer):
         parent_module = find_parent(model, layer_name)
         node = layer_name.split(".")[-1]
 
+        if tensor_name == "bias":
+            # this should already be set
+            return
+
         # set module state_dict
         module_state_dict = {}
         for k, v in state_dict.items():
diff --git a/tests/quantization/hqq/test_hqq.py b/tests/quantization/hqq/test_hqq.py
index c25aada6ed4..7335a937086 100755
--- a/tests/quantization/hqq/test_hqq.py
+++ b/tests/quantization/hqq/test_hqq.py
@@ -145,6 +145,28 @@ class HQQTestMultiGPU(unittest.TestCase):
         check_forward(self, hqq_runner.model)
 
 
+@slow
+@require_torch_gpu
+@require_accelerate
+@require_hqq
+class HQQTestBias(unittest.TestCase):
+    def tearDown(self):
+        cleanup()
+
+    def test_fp16_quantized_model(self):
+        """
+        Simple LLM model testing fp16 with bias
+        """
+        quant_config = HqqConfig(nbits=8, group_size=64)
+
+        hqq_runner = HQQLLMRunner(
+            model_id="facebook/opt-125m", quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
+        )
+
+        check_hqqlayer(self, hqq_runner.model.model.decoder.layers[0].self_attn.v_proj)
+        check_forward(self, hqq_runner.model)
+
+
 @slow
 @require_torch_gpu
 @require_accelerate
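
For reference, a minimal usage sketch of the path this diff touches (not part of the patch itself; it assumes the standard transformers loading API and uses facebook/opt-125m, whose Linear layers carry a bias, with the same HqqConfig as the new HQQTestBias test):

# Minimal sketch (assumption: standard `transformers` loading API; not taken from this PR).
# Quantizing a bias-carrying model such as facebook/opt-125m routes the bias tensors
# through the HqqHfQuantizer changes shown above.
import torch
from transformers import AutoModelForCausalLM, HqqConfig

quant_config = HqqConfig(nbits=8, group_size=64)

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",
    torch_dtype=torch.float16,
    device_map="cuda",
    quantization_config=quant_config,
)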