Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-04 05:10:06 +06:00
Small typo lines 47 and 199 perf_infer_gpu_one.md (#37938)
* Small typo line 199 perf_infer_gpu_one.md
* Typo l. 47 perf_infer_gpu_one.md
parent cc68070d41
commit 057ae00504
@@ -44,7 +44,7 @@ Place all inputs on the same device as the model.
 from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
 
 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-tokenizer = AutoTokenizer("meta-llama/Llama-3.1-8B")
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", quantization_config=quantization_config)
 
 prompt = "Hello, my llama is cute"
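For reference, the corrected snippet runs end to end as below — a minimal sketch assuming bitsandbytes is installed and you have access to the gated meta-llama/Llama-3.1-8B checkpoint; moving the inputs with `.to(model.device)` follows the hunk's own context line, "Place all inputs on the same device as the model."

```python
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

# Load the weights in 8-bit via bitsandbytes to roughly halve memory use.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# from_pretrained is the correct constructor; calling AutoTokenizer(...) directly,
# as the pre-fix line did, raises an error.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    device_map="auto",
    quantization_config=quantization_config,
)

# Place all inputs on the same device as the model.
prompt = "Hello, my llama is cute"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```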
@@ -196,7 +196,7 @@ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_m
 input_text = "Hello, my llama is cute"
 inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
 
-with sdpa_kernel(SDPBackend.FLASH_ATTENTION)::
+with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
     outputs = model.generate(**inputs)
 
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
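Likewise for the second fix — a minimal sketch assuming PyTorch 2.3+ (where torch.nn.attention.sdpa_kernel lives), a CUDA device, and that the model is loaded with attn_implementation="sdpa" (my assumption; the hunk header truncates the actual from_pretrained call):

```python
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
# attn_implementation="sdpa" routes attention through
# torch.nn.functional.scaled_dot_product_attention.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
).to("cuda")

input_text = "Hello, my llama is cute"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# Restrict SDPA to the FlashAttention backend for this block.
# The fix: a single colon ends the with statement; the doubled "::" was a SyntaxError.
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```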