From 057ae00504ce6daad1a29baa0451b21c1c1b94b3 Mon Sep 17 00:00:00 2001
From: nlhm <61977480+nlhmnlhmnlhm@users.noreply.github.com>
Date: Tue, 6 May 2025 15:32:55 +0200
Subject: [PATCH] Small typo lines 47 and 199 perf_infer_gpu_one.md (#37938)

* Small typo line 199 perf_infer_gpu_one.md

* Typo l. 47 perf_infer_gpu_one.md
---
 docs/source/en/perf_infer_gpu_one.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index 35297d332ca..c3a7ddc8d8a 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -44,7 +44,7 @@ Place all inputs on the same device as the model.
 from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
 
 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-tokenizer = AutoTokenizer("meta-llama/Llama-3.1-8B")
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", quantization_config=quantization_config)
 
 prompt = "Hello, my llama is cute"
@@ -196,7 +196,7 @@ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_m
 input_text = "Hello, my llama is cute"
 inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
 
-with sdpa_kernel(SDPBackend.FLASH_ATTENTION)::
+with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
 
     outputs = model.generate(**inputs)
 
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
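
Not part of the patch itself: a minimal runnable sketch of the corrected usage the two hunks converge on, combining the quantized-load snippet (line 47) and the SDPA context-manager snippet (line 199) into one script for illustration; in the doc they live in separate sections. It assumes a CUDA GPU, `bitsandbytes` installed, PyTorch 2.3+ for `torch.nn.attention`, and access to `meta-llama/Llama-3.1-8B`.

```python
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Fix at line 47: use the from_pretrained classmethod, not a bare AutoTokenizer(...) call.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    device_map="auto",
    quantization_config=quantization_config,
)

inputs = tokenizer("Hello, my llama is cute", return_tensors="pt").to(model.device)

# Fix at line 199: a single colon after the context manager, not a doubled one.
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```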