Improve model loading for compressed tensor models (#36152)

* Disable warnings for stacked compressors
* Introduce two new hooks in HfQuantizer lifecycle
to allow updates to missing and unexpected keys
* Update missing and unexpected keys
for stacked compressors
* Add tests
* Fix: run_compressed cases
* Fix: uncompressed cases

* Rename compressed_tensor folder to compressed_tensors
Move RunCompressedTest to the same file
Update tests to unittest
Rahul Tuli 2025-02-24 06:47:21 -06:00 committed by GitHub
parent 4dbf17c17f
commit 884a8ea1f0
8 changed files with 307 additions and 176 deletions

View File

@@ -4673,6 +4673,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
if hf_quantizer is not None:
missing_keys = hf_quantizer.update_missing_keys(model, missing_keys, prefix)
unexpected_keys = hf_quantizer.update_unexpected_keys(model, unexpected_keys, prefix)
# retrieve weights on meta device and put them back on CPU.
# This is not ideal in terms of memory, but if we don't do that now, we can't initialize them in the next step
@@ -4993,6 +4994,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
load_offloaded_weights(model_to_load, state_dict_index, state_dict_folder)
shutil.rmtree(state_dict_folder)
if hf_quantizer is not None:
missing_keys = hf_quantizer.update_missing_keys_after_loading(model_to_load, missing_keys, prefix)
if len(error_msgs) > 0:
error_msg = "\n\t".join(error_msgs)
if "size mismatch" in error_msg:

View File

@@ -109,6 +109,27 @@ class HfQuantizer(ABC):
"""
return missing_keys
def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]:
"""
Override this method if you want to adjust the `unexpected_keys`.
Args:
unexpected_keys (`List[str]`, *optional*):
The list of unexpected keys in the checkpoint compared to the state dict of the model
"""
return unexpected_keys
def update_missing_keys_after_loading(self, model, missing_keys: List[str], prefix: str) -> List[str]:
"""
Override this method if you want to adjust the `missing_keys` after loading the model params,
but before the model is post-processed.
Args:
missing_keys (`List[str]`, *optional*):
The list of missing keys in the checkpoint compared to the state dict of the model
"""
return missing_keys
def update_expected_keys(self, model, expected_keys: List[str], loaded_keys: List[str]) -> List[str]:
"""
Override this method if you want to adjust the `expected_keys`.
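As a sketch of how a quantization backend could use the two new hooks, the hypothetical quantizer below filters keys with made-up name patterns; the remaining abstract members of `HfQuantizer` are omitted for brevity, so this class is a sketch rather than a working backend.

# Hypothetical quantizer illustrating the new hooks; the patterns are examples only.
import re
from typing import List

from transformers.quantizers.base import HfQuantizer


class MyPackedQuantizer(HfQuantizer):
    # Remaining abstract methods/properties of HfQuantizer are omitted here.

    def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]:
        # Checkpoint-only packing metadata should not be reported as unexpected.
        ignore = re.compile(r"\.(weight_packed|weight_scale)$")
        return [k for k in unexpected_keys if ignore.search(k) is None]

    def update_missing_keys_after_loading(self, model, missing_keys: List[str], prefix: str) -> List[str]:
        # Dense weights rebuilt in _process_model_after_weight_loading are not truly missing.
        return [k for k in missing_keys if not k.endswith(".weight")]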

View File

@@ -14,6 +14,8 @@
import os
import re
from typing import List
from ..utils import is_compressed_tensors_available, is_torch_available, logging
from ..utils.quantization_config import CompressedTensorsConfig
@@ -50,6 +52,45 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
self.run_compressed = quantization_config.run_compressed
self.quantization_config = quantization_config
def update_missing_keys_after_loading(self, model, missing_keys: List[str], prefix: str) -> List[str]:
"""
Update missing keys after loading the model params. This is necessary for compressed tensors
to load the model correctly: some weight keys are expected to be missing because their dense
weights are reconstructed by ModelCompressor in _process_model_after_weight_loading.
This function removes those expected missing keys and returns the remaining ones.
"""
if self.run_compressed:
return missing_keys
# We expect some keys to be missing for compressed models.
# This is fine since those weights are reconstructed by ModelCompressor
# in _process_model_after_weight_loading
expected_missing_keys = self.compressor.get_missing_module_keys(model)
return [
key for key in missing_keys if not any(re.match(f".*{pattern}", key) for pattern in expected_missing_keys)
]
def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]:
"""
Remove keys from `unexpected_keys` that are expected to be present only in the compressed checkpoint.
Args:
unexpected_keys (`List[str]`, *optional*):
The list of unexpected keys in the checkpoint compared to the state dict of the model
"""
if self.run_compressed:
return unexpected_keys
# We expect some unexpected keys in the
# safetensors file of compressed models
keys_to_ignore = self.compressor.get_unexpected_file_keys(model)
return [key for key in unexpected_keys if not any(re.match(f".*{pattern}", key) for pattern in keys_to_ignore)]
def validate_environment(self, *args, **kwargs):
if not is_compressed_tensors_available():
raise ImportError(
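The key filtering in `update_missing_keys_after_loading` and `update_unexpected_keys` above is a plain pattern match over key names. The snippet below replays that filtering with made-up keys and patterns; the real patterns come from `ModelCompressor.get_missing_module_keys` and `get_unexpected_file_keys`.

# Illustration of the pattern-based key filtering, with made-up keys and patterns.
import re

missing_keys = [
    "model.layers.0.self_attn.q_proj.weight",
    "model.layers.0.mlp.gate_proj.weight",
    "lm_head.weight",
]
# e.g. keys whose dense weights ModelCompressor will reconstruct later
expected_missing_keys = ["self_attn.q_proj.weight", "mlp.gate_proj.weight"]

remaining = [
    key
    for key in missing_keys
    if not any(re.match(f".*{pattern}", key) for pattern in expected_missing_keys)
]
print(remaining)  # ['lm_head.weight'] -- only genuinely missing keys are reported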
@@ -75,9 +116,11 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
ct_quantization_config = self.compressor.quantization_config
if self.run_compressed and self.is_quantization_compressed:
if self.run_compressed:
if not self.is_quantization_compressed:
raise ValueError("`run_compressed` is only supported for quantized_compressed models")
apply_quantization_config(model, ct_quantization_config, run_compressed=True)
elif not self.is_quantization_compressed:
elif self.is_quantized and not self.is_quantization_compressed:
apply_quantization_config(model, ct_quantization_config)
def _process_model_after_weight_loading(self, model, **kwargs):
@@ -99,6 +142,12 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
self.compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN
self.compressor.decompress(model_path=cache_path, model=model)
@property
def is_quantized(self):
return self.quantization_config.quantization_config is not None and bool(
self.quantization_config.quantization_config.config_groups
)
@property
def is_quantization_compressed(self):
from compressed_tensors.quantization import QuantizationStatus
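Putting the pieces together, the two supported loading paths look roughly as follows. The model stub is one of the checkpoints used in the tests below; running this requires the compressed-tensors package to be installed and enough memory for the checkpoint.

# Two loading paths for a compressed-tensors checkpoint (stub taken from the tests below).
from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import CompressedTensorsConfig

stub = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"

# Default path: run_compressed=True keeps CompressedLinear modules and is only
# valid for quantization-compressed checkpoints.
model_compressed = AutoModelForCausalLM.from_pretrained(stub)

# Opt-out path: weights are decompressed after loading; this relies on
# update_missing_keys_after_loading to silence the expected missing-key warnings.
model_dense = AutoModelForCausalLM.from_pretrained(
    stub,
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)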

View File

@@ -1,80 +0,0 @@
import gc
import unittest
from transformers import AutoModelForCausalLM
from transformers.testing_utils import require_compressed_tensors, require_torch
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class CompressedTensorsTest(unittest.TestCase):
model_sparse_uncompressed = "horheynm/llama2.c_stories15M_pruned_50.2of4_uncompressed"
model_sparse_compressed = "horheynm/llama2.c_stories15M_pruned_50.2of4_compressed"
prompt = "Paris is the capital of which country?"
stubs = [model_sparse_uncompressed, model_sparse_compressed]
def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
gc.collect()
def test_compressed_uncompressed_model_shapes(self):
"""
Check that the weights are the same between
uncompressed and compressed-decompressed model
Sparse compressed modules' weights are "packed" and shape/value will
differ
"""
def _has_nested_attr(obj, attr_path):
attrs = attr_path.split(".")
for attr in attrs:
if not hasattr(obj, attr):
return None
obj = getattr(obj, attr)
return obj
from compressed_tensors.quantization.utils import iter_named_leaf_modules
uncompressed_model = AutoModelForCausalLM.from_pretrained(
self.model_sparse_uncompressed,
)
compressed_model_decompressed = AutoModelForCausalLM.from_pretrained(
self.model_sparse_compressed,
)
for name, submodule in iter_named_leaf_modules(
uncompressed_model,
):
if comp_decomp_obj := _has_nested_attr(compressed_model_decompressed, name):
if hasattr(submodule, "weight"):
assert torch.equal(submodule.weight, comp_decomp_obj.weight)
def test_run_compressed_outputs_match(self):
"""Check that uncompressed and compressed-decompressed model outputs are the same"""
from transformers import AutoTokenizer
for stub in self.stubs:
tokenizer = AutoTokenizer.from_pretrained(stub)
input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids
uncompressed_model = AutoModelForCausalLM.from_pretrained(
self.model_sparse_uncompressed,
)
output_rc_true = uncompressed_model.generate(input_ids, max_new_tokens=100)
compressed_model_decompressed = AutoModelForCausalLM.from_pretrained(
self.model_sparse_compressed,
)
output_rc_false = compressed_model_decompressed.generate(input_ids, max_new_tokens=100)
assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0])

View File

@@ -1,94 +0,0 @@
import gc
import unittest
from transformers import AutoModelForCausalLM
from transformers.testing_utils import require_compressed_tensors, require_torch
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class CompressedTensorsTest(unittest.TestCase):
tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
prompt = "Paris is the capital of which country?"
stubs = [tinyllama_w4a16, tinyllama_w8a8]
def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
gc.collect()
def test_default_run_compressed__True(self):
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization.utils import iter_named_leaf_modules
for stub in self.stubs:
model = AutoModelForCausalLM.from_pretrained(
stub,
)
compressed_linear_counts = 0
for _, submodule in iter_named_leaf_modules(
model,
):
if isinstance(submodule, CompressedLinear):
compressed_linear_counts += 1
# some Linear modules are not compressed, e.g. lm_head
assert compressed_linear_counts > 0
def test_default_run_compressed__False(self):
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization.utils import iter_named_leaf_modules
from transformers.utils.quantization_config import CompressedTensorsConfig
quantization_config = CompressedTensorsConfig(run_compressed=False)
for stub in self.stubs:
model = AutoModelForCausalLM.from_pretrained(
stub,
quantization_config=quantization_config,
)
compressed_linear_counts = 0
for _, submodule in iter_named_leaf_modules(
model,
):
if isinstance(submodule, CompressedLinear):
compressed_linear_counts += 1
# No modules should be CompressedLinear
assert compressed_linear_counts == 0
def test_run_compressed_outputs_match(self):
"""Check that run_compressed=True/False output are the same"""
from transformers import AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig
quantization_config = CompressedTensorsConfig(run_compressed=False)
for stub in self.stubs:
tokenizer = AutoTokenizer.from_pretrained(stub)
input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids
model_run_compressed__True = AutoModelForCausalLM.from_pretrained(
stub,
)
output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)
model_run_compressed__False = AutoModelForCausalLM.from_pretrained(
stub,
quantization_config=quantization_config,
)
output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)
assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0])

View File

@@ -0,0 +1,231 @@
import gc
import unittest
import warnings
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.testing_utils import require_compressed_tensors, require_torch
from transformers.utils import is_torch_available
from transformers.utils.quantization_config import CompressedTensorsConfig
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class StackCompressedModelTest(unittest.TestCase):
# Define stubs as class attributes
compressed_uncompressed_model_stubs = [
(
"nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
"nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
),
(
"nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
"nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed",
),
(
"nm-testing/llama2.c-stories42M-gsm8k-stacked-compressed",
"nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed",
),
]
# Flatten the list for tests that require a single list of stubs.
model_stubs = [stub for pair in compressed_uncompressed_model_stubs for stub in pair]
# For the outputs matching test, use the sparse-only pair.
sparse_compressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed"
sparse_uncompressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed"
prompt = "Paris is the capital of which country?"
def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
gc.collect()
def test_compressed_uncompressed_model_shapes(self):
"""
Verify that the weights of an uncompressed model and its decompressed compressed counterpart match.
Note: quantization-compressed weights can differ slightly after decompression, so they are compared with a tolerance.
"""
def _has_nested_attr(obj, attr_path):
attrs = attr_path.split(".")
for attr in attrs:
if not hasattr(obj, attr):
return None
obj = getattr(obj, attr)
return obj
from compressed_tensors.quantization.utils import iter_named_leaf_modules
for compressed_model, uncompressed_model in self.compressed_uncompressed_model_stubs:
with self.subTest(compressed_model=compressed_model, uncompressed_model=uncompressed_model):
uncompressed = AutoModelForCausalLM.from_pretrained(
uncompressed_model,
device_map="auto",
torch_dtype="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
compressed_decompressed = AutoModelForCausalLM.from_pretrained(
compressed_model,
device_map="auto",
torch_dtype="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
for name, submodule in iter_named_leaf_modules(uncompressed):
comp_decomp_obj = _has_nested_attr(compressed_decompressed, name)
if comp_decomp_obj is not None and hasattr(submodule, "weight"):
if "sparse-only" in uncompressed_model:
self.assertTrue(
torch.equal(submodule.weight, comp_decomp_obj.weight),
f"Weight mismatch for module '{name}' in sparse-only model.",
)
else:
self.assertTrue(
torch.allclose(submodule.weight, comp_decomp_obj.weight, atol=0.2),
f"Weight mismatch for module '{name}' in quantized-only or stacked model.",
)
def test_outputs_match(self):
"""
Ensure that the generated outputs match between the uncompressed model
and its decompressed compressed counterpart.
"""
tokenizer = AutoTokenizer.from_pretrained(self.sparse_uncompressed_model)
input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids
uncompressed = AutoModelForCausalLM.from_pretrained(
self.sparse_uncompressed_model,
device_map="auto",
torch_dtype="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
output_uncompressed = uncompressed.generate(input_ids.to(uncompressed.device), max_new_tokens=100)
decompressed = AutoModelForCausalLM.from_pretrained(
self.sparse_compressed_model,
device_map="auto",
torch_dtype="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
output_decompressed = decompressed.generate(input_ids.to(decompressed.device), max_new_tokens=100)
self.assertEqual(
tokenizer.decode(output_uncompressed[0]),
tokenizer.decode(output_decompressed[0]),
"Generated outputs do not match between compressed and uncompressed models.",
)
def test_no_warnings_for_all_models(self):
"""
Confirm that loading any model using compressed tensors does not trigger
warnings about missing or unexpected keys.
"""
for model_stub in self.model_stubs:
with self.subTest(model_stub=model_stub):
with warnings.catch_warnings(record=True) as caught_warnings:
warnings.simplefilter("always")
AutoModelForCausalLM.from_pretrained(
model_stub,
device_map="auto",
torch_dtype="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
for warning in caught_warnings:
self.assertNotIn(
"missing keys",
str(warning.message).lower(),
f"'missing keys' found in warnings for model {model_stub}",
)
self.assertNotIn(
"unexpected keys",
str(warning.message).lower(),
f"'unexpected keys' found in warnings for model {model_stub}",
)
@require_compressed_tensors
@require_torch
class RunCompressedTest(unittest.TestCase):
tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
prompt = "Paris is the capital of which country?"
stubs = [tinyllama_w4a16, tinyllama_w8a8]
def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
gc.collect()
def test_default_run_compressed__True(self):
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization.utils import iter_named_leaf_modules
for stub in self.stubs:
model = AutoModelForCausalLM.from_pretrained(
stub,
)
compressed_linear_counts = 0
for _, submodule in iter_named_leaf_modules(
model,
):
if isinstance(submodule, CompressedLinear):
compressed_linear_counts += 1
# some Linear modules are not compressed, e.g. lm_head
assert compressed_linear_counts > 0
def test_default_run_compressed__False(self):
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization.utils import iter_named_leaf_modules
from transformers.utils.quantization_config import CompressedTensorsConfig
quantization_config = CompressedTensorsConfig(run_compressed=False)
for stub in self.stubs:
model = AutoModelForCausalLM.from_pretrained(
stub,
quantization_config=quantization_config,
)
compressed_linear_counts = 0
for _, submodule in iter_named_leaf_modules(
model,
):
if isinstance(submodule, CompressedLinear):
compressed_linear_counts += 1
# No modules should be CompressedLinear
assert compressed_linear_counts == 0
def test_run_compressed_outputs_match(self):
"""Check that run_compressed=True/False output are the same"""
from transformers import AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig
quantization_config = CompressedTensorsConfig(run_compressed=False)
for stub in self.stubs:
tokenizer = AutoTokenizer.from_pretrained(stub)
input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids
model_run_compressed__True = AutoModelForCausalLM.from_pretrained(
stub,
)
output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)
model_run_compressed__False = AutoModelForCausalLM.from_pretrained(
stub,
quantization_config=quantization_config,
)
output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)
assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0])