From e1812864ab7b482b974591523eb9520480c1a0b3 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Thu, 12 Jun 2025 08:17:07 -0400
Subject: [PATCH] [docs] Add int4wo + 2:4 sparsity example to TorchAO README
 (#38592)

* update quantization readme

* update

---------

Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
---
 docs/source/en/quantization/torchao.md | 63 ++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md
index 164f6851f32..6269294a332 100644
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@@ -38,6 +38,7 @@ torchao supports the [quantization techniques](https://github.com/pytorch/ao/blo
 - A8W8 Int8 Dynamic Quantization
 - A16W8 Int8 Weight Only Quantization
 - A16W4 Int4 Weight Only Quantization
+- A16W4 Int4 Weight Only Quantization + 2:4 Sparsity
 - Autoquantization
 
 torchao also supports module-level configuration by specifying a dictionary that maps a module's fully qualified name to its quantization config. This allows skipping quantization for certain layers and using different quantization configs for different modules.
@@ -147,6 +148,37 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 
+
+
+
+```py
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from torchao.quantization import Int4WeightOnlyConfig
+from torchao.dtypes import MarlinSparseLayout
+
+quant_config = Int4WeightOnlyConfig(layout=MarlinSparseLayout())
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+# Load and quantize the model with sparsity. A sparse checkpoint is needed to accelerate without accuracy loss
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "RedHatAI/Sparse-Llama-3.1-8B-2of4",
+    torch_dtype=torch.float16,
+    device_map="cuda",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained("RedHatAI/Sparse-Llama-3.1-8B-2of4")
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+# auto-compile the quantized model with `cache_implementation="static"` to get a speedup
+output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+
+
 
 ### A100 GPU
@@ -215,6 +247,37 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 
+
+
+
+```py
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from torchao.quantization import Int4WeightOnlyConfig
+from torchao.dtypes import MarlinSparseLayout
+
+quant_config = Int4WeightOnlyConfig(layout=MarlinSparseLayout())
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+# Load and quantize the model with sparsity. A sparse checkpoint is needed to accelerate without accuracy loss
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "RedHatAI/Sparse-Llama-3.1-8B-2of4",
+    torch_dtype=torch.float16,
+    device_map="cuda",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained("RedHatAI/Sparse-Llama-3.1-8B-2of4")
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+# auto-compile the quantized model with `cache_implementation="static"` to get a speedup
+output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+
+
 
 ### CPU