From 279000bb7059728166786cd525e35b8152883fdf Mon Sep 17 00:00:00 2001
From: Driss Guessous <32754868+drisspg@users.noreply.github.com>
Date: Tue, 3 Jun 2025 11:43:31 -0400
Subject: [PATCH] Name change AOPermod -> ModuleFqn (#38456)

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
---
 docs/source/en/quantization/torchao.md      | 20 ++++++++++----------
 .../quantizers/quantizer_torchao.py         |  8 ++++----
 .../torchao_integration/test_torchao.py     |  6 +++---
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md
index bee2e008b95..ac9fbc7ca72 100644
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@@ -62,7 +62,7 @@ Install torchao from PyPi or the PyTorch index with the following commands.
 # Stable release from Pypi which will default to CUDA 12.6
 pip install --upgrade torchao transformers
 ```
-
+
 Stable Release from the PyTorch index

 ```bash
@@ -276,18 +276,18 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))

 ### Per Module Quantization
 #### 1. Skip quantization for certain layers
-With `AOPerModuleConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
+With `ModuleFqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.

 ```py
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

 model_id = "meta-llama/Llama-3.1-8B-Instruct"

-from torchao.quantization import Int4WeightOnlyConfig, AOPerModuleConfig
+from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig
 config = Int4WeightOnlyConfig(group_size=128)
 # set default to int4 (for linears), and skip quantizing `model.layers.0.self_attn.q_proj`
-quant_config = AOPerModuleConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
+quant_config = ModuleFqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
 quantization_config = TorchAoConfig(quant_type=quant_config)
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
 # lm_head is not quantized and model.layers.0.self_attn.q_proj is not quantized
@@ -311,7 +311,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

 model_id = "facebook/opt-125m"

-from torchao.quantization import Int4WeightOnlyConfig, AOPerModuleConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
+from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType

 weight_dtype = torch.int8
 granularity = PerAxis(0)
@@ -322,7 +322,7 @@ embedding_config = IntxWeightOnlyConfig(
     mapping_type=mapping_type,
 )
 linear_config = Int8DynamicActivationInt4WeightConfig(group_size=128)
-quant_config = AOPerModuleConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
+quant_config = ModuleFqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
 # set `include_embedding` to True in order to include embedding in quantization
 # when `include_embedding` is True, we'll remove input embedding from `modules_not_to_convert` as well
 quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)
@@ -427,8 +427,8 @@ quantized_model.save_pretrained(output_dir, safe_serialization=False)

 # reload the quantized model
 reloaded_model = AutoModelForCausalLM.from_pretrained(
-    output_dir,
-    device_map="auto",
+    output_dir,
+    device_map="auto",
     torch_dtype=torch.bfloat16
 )
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
@@ -463,8 +463,8 @@ quantized_model.save_pretrained(output_dir, safe_serialization=False)

 # reload the quantized model
 reloaded_model = AutoModelForCausalLM.from_pretrained(
-    output_dir,
-    device_map="cpu",
+    output_dir,
+    device_map="cpu",
     torch_dtype=torch.bfloat16
 )
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py
index 030b0de0c0c..22b2a88ee4d 100644
--- a/src/transformers/quantizers/quantizer_torchao.py
+++ b/src/transformers/quantizers/quantizer_torchao.py
@@ -261,12 +261,12 @@ class TorchAoHfQuantizer(HfQuantizer):
                 model.tie_weights()
                 setattr(model.config.get_text_config(decoder=True), "tie_word_embeddings", False)

-        # handle AOPerModuleConfig, introduced in torchao 0.11.0+
-        if self.quantization_config._get_ao_version() > version.Version("0.10.0"):
-            from torchao.quantization import AOPerModuleConfig
+        # handle ModuleFqnToConfig, introduced in torchao 0.12.0+
+        if self.quantization_config._get_ao_version() >= version.Version("0.12.0"):
+            from torchao.quantization import ModuleFqnToConfig

             config = self.quantization_config.get_apply_tensor_subclass()
-            if isinstance(config, AOPerModuleConfig):
+            if isinstance(config, ModuleFqnToConfig):
                 module_fqn, _ = param_name.rsplit(".", 1)
                 c = None
                 if module_fqn in config.module_fqn_to_config:
diff --git a/tests/quantization/torchao_integration/test_torchao.py b/tests/quantization/torchao_integration/test_torchao.py
index bf60deef8b6..c756606a95e 100644
--- a/tests/quantization/torchao_integration/test_torchao.py
+++ b/tests/quantization/torchao_integration/test_torchao.py
@@ -43,10 +43,10 @@ if is_torchao_available():
         TensorCoreTiledLayout,
     )
     from torchao.quantization import (
-        AOPerModuleConfig,
         Int8WeightOnlyConfig,
         IntxWeightOnlyConfig,
         MappingType,
+        ModuleFqnToConfig,
         PerAxis,
     )
     from torchao.quantization.autoquant import AQMixin
@@ -226,7 +226,7 @@ class TorchAoTest(unittest.TestCase):
             granularity=granularity,
             mapping_type=mapping_type,
         )
-        config = AOPerModuleConfig(
+        config = ModuleFqnToConfig(
             {"_default": None, "model.embed_tokens": embedding_config, "lm_head": embedding_config}
         )
         # need set `include_input_output_embeddings` to True
@@ -253,7 +253,7 @@ class TorchAoTest(unittest.TestCase):
     @require_torchao_version_greater_or_equal("0.11.0")
     def test_per_module_config_skip(self):
         linear_config = Int8WeightOnlyConfig()
-        config = AOPerModuleConfig({"_default": linear_config, "model.layers.0.self_attn.q_proj": None})
+        config = ModuleFqnToConfig({"_default": linear_config, "model.layers.0.self_attn.q_proj": None})
         quant_config = TorchAoConfig(quant_type=config)
         quantized_model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
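
For context on the change, here is a minimal standalone sketch (not part of the patch, and not torchao API) of the per-module lookup that `ModuleFqnToConfig` enables, as described in the docs and quantizer hunks above: the module FQN is derived from the parameter name via `rsplit`, matched against the mapping, and otherwise falls back to the `"_default"` entry; a value of `None` skips quantization for that module. The `resolve_module_config` helper and the placeholder config strings below are illustrative assumptions.

```py
def resolve_module_config(module_fqn_to_config, param_name):
    """Hypothetical helper mirroring the lookup order described above.

    1. exact module-FQN match wins,
    2. otherwise fall back to the "_default" entry;
    a value of None means "skip quantization" for that module.
    """
    # "model.layers.0.self_attn.q_proj.weight" -> "model.layers.0.self_attn.q_proj"
    module_fqn, _ = param_name.rsplit(".", 1)
    if module_fqn in module_fqn_to_config:
        return module_fqn_to_config[module_fqn]
    return module_fqn_to_config.get("_default")


# Placeholder strings stand in for real torchao config objects.
mapping = {"_default": "int4-weight-only", "model.layers.0.self_attn.q_proj": None}
assert resolve_module_config(mapping, "model.layers.0.self_attn.q_proj.weight") is None  # skipped
assert resolve_module_config(mapping, "model.layers.1.mlp.up_proj.weight") == "int4-weight-only"
```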