Enable XPU doc (#38929)

* fix example with dataset

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* update torchao doc

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* update torchao doc

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix device type

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* revert torchao change

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix torchao doc

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* revert torchao change

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* update xpu torchao doc

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* update chat_templating_multimodal.md

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* use full name for int8

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* revert int8 title

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
Authored by jiqing-feng on 2025-06-30 22:56:55 +08:00, committed by GitHub
parent ea0ea392e5
commit 03db2700ab
4 changed files with 75 additions and 8 deletions

View File

@@ -56,7 +56,7 @@ Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models,
import torch
from transformers import pipeline
pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda", torch_dtype=torch.float16)
pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device_map="auto", torch_dtype=torch.float16)
pipeline(text=messages, max_new_tokens=50, return_full_text=False)
[{'input_text': [{'role': 'system',
'content': [{'type': 'text',
@@ -175,7 +175,7 @@ processed_chat = processor.apply_chat_template(
add_generation_prompt=True,
tokenize=True,
return_dict=True,
-video_fps=32,
+video_fps=16,
video_load_backend="decord",
)
print(processed_chat.keys())
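
For context, here is a minimal end-to-end sketch (mine, not part of this diff) of the updated pipeline call: `device_map="auto"` lets the model land on whatever backend is available (CUDA, XPU, or CPU) instead of hardcoding `device="cuda"`.

```py
# Illustrative sketch assuming a recent transformers/accelerate install;
# device_map="auto" picks the available accelerator and falls back to CPU.
import torch
from transformers import pipeline

pipe = pipeline(
    "image-text-to-text",
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    device_map="auto",
    torch_dtype=torch.float16,
)
print(pipe.device)  # e.g. cuda:0, xpu:0, or cpu
```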

View File

@@ -26,6 +26,7 @@ Pass the audio signal, typically stored in `array`, to the feature extractor and
from transformers import AutoFeatureExtractor
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
processed_sample
{'input_values': [array([ 9.4472744e-05, 3.0777880e-03, -2.8888427e-03, ...,
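
For reference, a self-contained version of the snippet above (my sketch, not part of the diff), adding the `datasets` import that the excerpt assumes:

```py
# Hedged sketch: complete form of the feature-extractor example shown in the hunk.
# Assumes the `datasets` library is installed; the dataset downloads on first run.
from datasets import load_dataset
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")

# sampling_rate tells the extractor the rate wav2vec2-base expects (16 kHz)
processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
print(processed_sample["input_values"][0][:5])
```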

View File

@@ -47,7 +47,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)
output = quantized_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))
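
The line above is the recurring pattern in this commit: instead of hardcoding `"cuda"`, inputs are sent to `quantized_model.device.type`, which resolves to `"cuda"`, `"xpu"`, or `"cpu"` depending on where `device_map` placed the model. A short continuation sketch (mine, assuming `quantized_model` and `tokenizer` are loaded as in the hunk above):

```py
# Illustrative: inputs follow the model's placement, so the same code runs
# unchanged on CUDA, XPU, or CPU machines.
device = quantized_model.device.type  # "cuda", "xpu", or "cpu"
input_ids = tokenizer("What are we having for dinner?", return_tensors="pt").to(device)
output = quantized_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```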

View File

@@ -49,6 +49,7 @@ Check the table below to see if your hardware is compatible.
| Component | Compatibility |
|----------|----------------|
| CUDA Versions | ✅ cu118, cu126, cu128 |
| XPU Versions | ✅ pytorch2.8 |
| CPU | ✅ change `device_map="cpu"` (see examples below) |
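
A quick way to see which row of the table applies on a given machine (an illustrative check of mine, not from the docs; `torch.xpu` is guarded because the module only exists in newer PyTorch builds):

```py
# Rough environment check against the compatibility table above:
# CUDA wheels cu118/cu126/cu128, or a PyTorch 2.8 build with Intel XPU support.
import torch

print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("XPU available:", hasattr(torch, "xpu") and torch.xpu.is_available())
```
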
@@ -278,6 +279,71 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
</hfoption>
</hfoptions>
### Intel XPU
<hfoptions id="examples-Intel-XPU">
<hfoption id="int8-dynamic-and-weight-only">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig
quant_config = Int8DynamicActivationInt8WeightConfig()
# or int8 weight only quantization
# quant_config = Int8WeightOnlyConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
quantized_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="int4-weight-only">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int4WeightOnlyConfig
from torchao.dtypes import Int4XPULayout
from torchao.quantization.quant_primitives import ZeroPointDomain
quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT)
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
quantized_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
</hfoptions>
### CPU
<hfoptions id="examples-CPU">
<hfoption id="int8-dynamic-and-weight-only">
@@ -363,7 +429,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
# Manual Testing
prompt = "Hey, are you conscious? Can you talk to me?"
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device.type)
generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
output_text = tokenizer.batch_decode(
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
@@ -434,7 +500,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -474,7 +540,7 @@ tokenizer.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128")
## Loading quantized models
-Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA.
+Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA or XPU.
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
@@ -491,7 +557,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
quantization_config=quantization_config
)
# save the quantized model
-output_dir = "llama-3.1-8b-torchao-int8-cuda"
+output_dir = "llama-3.1-8b-torchao-int8"
quantized_model.save_pretrained(output_dir, safe_serialization=False)
# reload the quantized model
@@ -502,7 +568,7 @@ reloaded_model = AutoModelForCausalLM.from_pretrained(
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(reloaded_model.device.type)
output = reloaded_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))
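
To make the XPU path concrete, here is a rough sketch (mine, not part of the commit) of reloading the checkpoint saved above on an Intel XPU machine; with no CUDA device present, `device_map="auto"` places the model on the XPU, and the inputs simply follow the model as in the hunks above.

```py
# Hedged sketch: reload the int8 checkpoint written to "llama-3.1-8b-torchao-int8"
# on an XPU box (assumes a PyTorch 2.8 XPU build, per the compatibility table).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

reloaded_model = AutoModelForCausalLM.from_pretrained(
    "llama-3.1-8b-torchao-int8",  # local directory from save_pretrained above
    torch_dtype="auto",
    device_map="auto",            # resolves to the XPU when it is the only accelerator
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

inputs = tokenizer("What are we having for dinner?", return_tensors="pt").to(reloaded_model.device.type)
output = reloaded_model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```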