mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-03 12:50:06 +06:00
Enable XPU doc (#38929)
* fix example with dataset Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update torchao doc Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update torchao doc Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix device type Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * revert torchao change Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix torchao doc Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * revert torchao change Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update xpu torchao doc Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update chat_templating_multimodal.md Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * use full name for int8 Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * revert int8 title Signed-off-by: jiqing-feng <jiqing.feng@intel.com> --------- Signed-off-by: jiqing-feng <jiqing.feng@intel.com> Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
This commit is contained in:
parent
ea0ea392e5
commit
03db2700ab
@ -56,7 +56,7 @@ Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models,
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda", torch_dtype=torch.float16)
|
||||
pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device_map="auto", torch_dtype=torch.float16)
|
||||
pipeline(text=messages, max_new_tokens=50, return_full_text=False)
|
||||
[{'input_text': [{'role': 'system',
|
||||
'content': [{'type': 'text',
|
||||
@ -175,7 +175,7 @@ processed_chat = processor.apply_chat_template(
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
video_fps=32,
|
||||
video_fps=16,
|
||||
video_load_backend="decord",
|
||||
)
|
||||
print(processed_chat.keys())
|
||||
|
@ -26,6 +26,7 @@ Pass the audio signal, typically stored in `array`, to the feature extractor and
|
||||
from transformers import AutoFeatureExtractor
|
||||
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
|
||||
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
|
||||
processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
|
||||
processed_sample
|
||||
{'input_values': [array([ 9.4472744e-05, 3.0777880e-03, -2.8888427e-03, ...,
|
||||
|
@ -47,7 +47,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
input_text = "What are we having for dinner?"
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)
|
||||
|
||||
output = quantized_model.generate(**input_ids, max_new_tokens=10)
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
|
@ -49,6 +49,7 @@ Check the table below to see if your hardware is compatible.
|
||||
| Component | Compatibility |
|
||||
|----------|----------------|
|
||||
| CUDA Versions | ✅ cu118, cu126, cu128 |
|
||||
| XPU Versions | ✅ pytorch2.8 |
|
||||
| CPU | ✅ change `device_map="cpu"` (see examples below) |
|
||||
|
||||
|
||||
@ -278,6 +279,71 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Intel XPU
|
||||
<hfoptions id="examples-Intel-XPU">
|
||||
<hfoption id="int8-dynamic-and-weight-only">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
|
||||
from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig
|
||||
|
||||
quant_config = Int8DynamicActivationInt8WeightConfig()
|
||||
# or int8 weight only quantization
|
||||
# quant_config = Int8WeightOnlyConfig()
|
||||
quantization_config = TorchAoConfig(quant_type=quant_config)
|
||||
|
||||
# Load and quantize the model
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-3.1-8B-Instruct",
|
||||
torch_dtype="auto",
|
||||
device_map="auto",
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
||||
input_text = "What are we having for dinner?"
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
|
||||
|
||||
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
|
||||
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
</hfoption>
|
||||
|
||||
<hfoption id="int4-weight-only">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
|
||||
from torchao.quantization import Int4WeightOnlyConfig
|
||||
from torchao.dtypes import Int4XPULayout
|
||||
from torchao.quantization.quant_primitives import ZeroPointDomain
|
||||
|
||||
|
||||
quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT)
|
||||
quantization_config = TorchAoConfig(quant_type=quant_config)
|
||||
|
||||
# Load and quantize the model
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-3.1-8B-Instruct",
|
||||
torch_dtype="auto",
|
||||
device_map="auto",
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
||||
input_text = "What are we having for dinner?"
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
|
||||
|
||||
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
|
||||
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
### CPU
|
||||
<hfoptions id="examples-CPU">
|
||||
<hfoption id="int8-dynamic-and-weight-only">
|
||||
@ -363,7 +429,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
# Manual Testing
|
||||
prompt = "Hey, are you conscious? Can you talk to me?"
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device.type)
|
||||
generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
|
||||
output_text = tokenizer.batch_decode(
|
||||
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
||||
@ -434,7 +500,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
||||
input_text = "What are we having for dinner?"
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)
|
||||
|
||||
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
|
||||
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
|
||||
@ -474,7 +540,7 @@ tokenizer.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128")
|
||||
|
||||
## Loading quantized models
|
||||
|
||||
Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA.
|
||||
Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA or XPU.
|
||||
```py
|
||||
import torch
|
||||
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
|
||||
@ -491,7 +557,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
# save the quantized model
|
||||
output_dir = "llama-3.1-8b-torchao-int8-cuda"
|
||||
output_dir = "llama-3.1-8b-torchao-int8"
|
||||
quantized_model.save_pretrained(output_dir, safe_serialization=False)
|
||||
|
||||
# reload the quantized model
|
||||
@ -502,7 +568,7 @@ reloaded_model = AutoModelForCausalLM.from_pretrained(
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
||||
input_text = "What are we having for dinner?"
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to(reloaded_model.device.type)
|
||||
|
||||
output = reloaded_model.generate(**input_ids, max_new_tokens=10)
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
|
Loading…
Reference in New Issue
Block a user