From 03db2700abf84971351c7374a548a9d4fc156916 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 30 Jun 2025 22:56:55 +0800 Subject: [PATCH] Enable XPU doc (#38929) * fix example with dataset Signed-off-by: jiqing-feng * update torchao doc Signed-off-by: jiqing-feng * update torchao doc Signed-off-by: jiqing-feng * fix device type Signed-off-by: jiqing-feng * revert torchao change Signed-off-by: jiqing-feng * fix torchao doc Signed-off-by: jiqing-feng * revert torchao change Signed-off-by: jiqing-feng * update xpu torchao doc Signed-off-by: jiqing-feng * update chat_templating_multimodal.md Signed-off-by: jiqing-feng * use full name for int8 Signed-off-by: jiqing-feng * revert int8 title Signed-off-by: jiqing-feng --------- Signed-off-by: jiqing-feng Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> --- docs/source/en/chat_templating_multimodal.md | 4 +- docs/source/en/feature_extractors.md | 1 + .../source/en/quantization/finegrained_fp8.md | 2 +- docs/source/en/quantization/torchao.md | 76 +++++++++++++++++-- 4 files changed, 75 insertions(+), 8 deletions(-) diff --git a/docs/source/en/chat_templating_multimodal.md b/docs/source/en/chat_templating_multimodal.md index 3a01f652aaa..190f7317728 100644 --- a/docs/source/en/chat_templating_multimodal.md +++ b/docs/source/en/chat_templating_multimodal.md @@ -56,7 +56,7 @@ Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models, import torch from transformers import pipeline -pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda", torch_dtype=torch.float16) +pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device_map="auto", torch_dtype=torch.float16) pipeline(text=messages, max_new_tokens=50, return_full_text=False) [{'input_text': [{'role': 'system', 'content': [{'type': 'text', @@ -175,7 +175,7 @@ processed_chat = processor.apply_chat_template( add_generation_prompt=True, tokenize=True, return_dict=True, - video_fps=32, + video_fps=16, video_load_backend="decord", ) print(processed_chat.keys()) diff --git a/docs/source/en/feature_extractors.md b/docs/source/en/feature_extractors.md index 6cc20205769..38c1247909e 100644 --- a/docs/source/en/feature_extractors.md +++ b/docs/source/en/feature_extractors.md @@ -26,6 +26,7 @@ Pass the audio signal, typically stored in `array`, to the feature extractor and from transformers import AutoFeatureExtractor feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") +dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000) processed_sample {'input_values': [array([ 9.4472744e-05, 3.0777880e-03, -2.8888427e-03, ..., diff --git a/docs/source/en/quantization/finegrained_fp8.md b/docs/source/en/quantization/finegrained_fp8.md index 53e2a1cd3b8..c0983548363 100644 --- a/docs/source/en/quantization/finegrained_fp8.md +++ b/docs/source/en/quantization/finegrained_fp8.md @@ -47,7 +47,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=" tokenizer = AutoTokenizer.from_pretrained(model_name) input_text = "What are we having for dinner?" 
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type) output = quantized_model.generate(**input_ids, max_new_tokens=10) print(tokenizer.decode(output[0], skip_special_tokens=True)) diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index 6269294a332..c86807d57aa 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -49,6 +49,7 @@ Check the table below to see if your hardware is compatible. | Component | Compatibility | |----------|----------------| | CUDA Versions | ✅ cu118, cu126, cu128 | +| XPU Versions | ✅ pytorch2.8 | | CPU | ✅ change `device_map="cpu"` (see examples below) | @@ -278,6 +279,71 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) +### Intel XPU + + + +```py +import torch +from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer +from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig + +quant_config = Int8DynamicActivationInt8WeightConfig() +# or int8 weight only quantization +# quant_config = Int8WeightOnlyConfig() +quantization_config = TorchAoConfig(quant_type=quant_config) + +# Load and quantize the model +quantized_model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.1-8B-Instruct", + torch_dtype="auto", + device_map="auto", + quantization_config=quantization_config +) + +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct") +input_text = "What are we having for dinner?" +input_ids = tokenizer(input_text, return_tensors="pt").to("xpu") + +# auto-compile the quantized model with `cache_implementation="static"` to get speed up +output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + + + + +```py +import torch +from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer +from torchao.quantization import Int4WeightOnlyConfig +from torchao.dtypes import Int4XPULayout +from torchao.quantization.quant_primitives import ZeroPointDomain + + +quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT) +quantization_config = TorchAoConfig(quant_type=quant_config) + +# Load and quantize the model +quantized_model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.1-8B-Instruct", + torch_dtype="auto", + device_map="auto", + quantization_config=quantization_config +) + +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct") +input_text = "What are we having for dinner?" +input_ids = tokenizer(input_text, return_tensors="pt").to("xpu") + +# auto-compile the quantized model with `cache_implementation="static"` to get speed up +output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + + + + ### CPU @@ -363,7 +429,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) # Manual Testing prompt = "Hey, are you conscious? Can you talk to me?" 
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda") +inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device.type) generated_ids = quantized_model.generate(**inputs, max_new_tokens=128) output_text = tokenizer.batch_decode( generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False @@ -434,7 +500,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained( tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct") input_text = "What are we having for dinner?" -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type) # auto-compile the quantized model with `cache_implementation="static"` to get speed up output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") @@ -474,7 +540,7 @@ tokenizer.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128") ## Loading quantized models -Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA. +Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA or XPU. ```py import torch from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer @@ -491,7 +557,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained( quantization_config=quantization_config ) # save the quantized model -output_dir = "llama-3.1-8b-torchao-int8-cuda" +output_dir = "llama-3.1-8b-torchao-int8" quantized_model.save_pretrained(output_dir, safe_serialization=False) # reload the quantized model @@ -502,7 +568,7 @@ reloaded_model = AutoModelForCausalLM.from_pretrained( ) tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct") input_text = "What are we having for dinner?" -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(reloaded_model.device.type) output = reloaded_model.generate(**input_ids, max_new_tokens=10) print(tokenizer.decode(output[0], skip_special_tokens=True))
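
For reference, the device-agnostic pattern this patch applies throughout the docs (`device_map="auto"` together with `.to(model.device.type)` instead of a hard-coded `"cuda"`) can be exercised end to end with the minimal sketch below. It is an illustration, not part of the patch: it reuses the `meta-llama/Llama-3.1-8B-Instruct` checkpoint from the examples above and assumes a PyTorch build with CUDA or XPU support, falling back to CPU otherwise.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.1-8B-Instruct"

# device_map="auto" places the weights on CUDA, XPU, or CPU, whichever is available
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# model.device.type resolves to "cuda", "xpu", or "cpu", so the same line
# runs on every backend without hard-coding a device string
inputs = tokenizer("What are we having for dinner?", return_tensors="pt").to(model.device.type)
output = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Writing the device handling this way is what lets a single documentation snippet run unchanged on CUDA, XPU, and CPU hosts.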