enable utils test cases on XPU (#38005)

* enable utils test cases on XPU

Signed-off-by: Yao Matrix <matrix.yao@intel.com>

* fix style

Signed-off-by: Yao Matrix <matrix.yao@intel.com>

* Update tests/utils/test_skip_decorators.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* fix comment

Signed-off-by: Yao Matrix <matrix.yao@intel.com>

---------

Signed-off-by: Yao Matrix <matrix.yao@intel.com>
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Author: Yao Matrix <matrix.yao@intel.com>
Date:   2025-05-09 14:45:01 +08:00 (committed by GitHub)
Commit: a72cb31434 (parent 1dfad4beb2)
4 changed files with 37 additions and 28 deletions

tests/utils/test_cache_utils.py

@@ -28,6 +28,7 @@ from transformers.testing_utils import (
     require_torch,
     require_torch_accelerator,
     require_torch_gpu,
+    require_torch_multi_accelerator,
     require_torch_multi_gpu,
     slow,
     torch_device,
@@ -355,7 +356,7 @@ class CacheHardIntegrationTest(unittest.TestCase):
         self.assertIsInstance(gen_out.past_key_values, DynamicCache)  # sanity check

     @parameterized.expand([("eager"), ("sdpa")])
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_static_cache_greedy_decoding_pad_left(self, attn_implementation):
         """Tests that different cache implementations work well with eager and SDPA inference"""
@@ -436,7 +437,7 @@ class CacheHardIntegrationTest(unittest.TestCase):
         offloaded_peak_memory = torch_accelerator_module.max_memory_allocated(device)
         self.assertTrue(offloaded_peak_memory < original_peak_memory)

-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_cache_copy(self):
         """Tests that we can manually set a cache, copy, and reuse it for generation"""
@@ -444,14 +445,14 @@ class CacheHardIntegrationTest(unittest.TestCase):
         # lazy init of cache layers
         model_name = "microsoft/Phi-3-mini-4k-instruct"
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16)
+        model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch_device, torch_dtype=torch.bfloat16)
         prompt_cache = StaticCache(
-            config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16
+            config=model.config, max_batch_size=1, max_cache_len=1024, device=torch_device, dtype=torch.bfloat16
         )

         INITIAL_PROMPT = "You are a helpful assistant. "
-        inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
+        inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(torch_device)
         # This is the common prompt cached, we need to run forward without grad to be able to copy
         with torch.no_grad():
             prompt_cache = model(**inputs_initial_prompt, past_key_values=prompt_cache).past_key_values
@@ -459,7 +460,7 @@ class CacheHardIntegrationTest(unittest.TestCase):
         prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"]
         responses = []
         for prompt in prompts:
-            new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
+            new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to(torch_device)
             past_key_values = copy.deepcopy(prompt_cache)
             outputs = model.generate(
                 **new_inputs, past_key_values=past_key_values, max_new_tokens=40, disable_compile=True
@@ -474,6 +475,7 @@ class CacheHardIntegrationTest(unittest.TestCase):
             "You are a helpful assistant. What is the capital of France?\n\n\n## Response:Paris is the capital "
             "of France.\n\n\n\n\n\n\n<|endoftext|>",
         ]
+
         self.assertEqual(responses, EXPECTED_DECODED_TEXT)

     @require_torch_multi_gpu
@@ -526,11 +528,11 @@ class CacheHardIntegrationTest(unittest.TestCase):
             model.generate(**inputs, max_new_tokens=2, cache_implementation="static")
         self.assertNotIn("cuda", cap.err.lower())

-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     @slow
     @require_read_token
-    def test_static_cache_multi_gpu(self):
-        """Regression test for #35164: static cache with multi-gpu"""
+    def test_static_cache_multi_accelerator(self):
+        """Regression test for #35164: static cache with multi-accelerator"""
         model_id = "google/gemma-2-2b-it"
         tokenizer = AutoTokenizer.from_pretrained(model_id)
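For orientation, here is a minimal sketch of the pattern these hunks apply; it is not code from the PR, and the class and test names are illustrative. require_torch_accelerator and torch_device come from transformers.testing_utils, so the same test body runs on CUDA, XPU, or any other supported accelerator without a hard-coded "cuda".

import unittest

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.testing_utils import require_torch_accelerator, torch_device


class DeviceAgnosticCacheExample(unittest.TestCase):
    @require_torch_accelerator  # skips unless some torch accelerator (CUDA, XPU, ...) is available
    def test_generate_without_hardcoded_cuda(self):
        model_name = "microsoft/Phi-3-mini-4k-instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # device_map=torch_device replaces device_map="cuda"
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch_device, torch_dtype=torch.bfloat16)
        inputs = tokenizer("You are a helpful assistant. ", return_tensors="pt").to(torch_device)
        out = model.generate(**inputs, max_new_tokens=5)
        self.assertEqual(out.shape[0], 1)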

tests/utils/test_deprecation.py

@@ -18,7 +18,7 @@ import warnings

 from parameterized import parameterized

 from transformers import __version__, is_torch_available
-from transformers.testing_utils import require_torch_gpu
+from transformers.testing_utils import require_torch_accelerator, torch_device
 from transformers.utils.deprecation import deprecate_kwarg
@@ -174,11 +174,11 @@ class DeprecationDecoratorTester(unittest.TestCase):
         result = dummy_function(deprecated_name="old_value", new_name="new_value")
         self.assertEqual(result, "new_value")

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_compile_safe(self):
         @deprecate_kwarg("deprecated_factor", new_name="new_factor", version=INFINITE_VERSION)
         def dummy_function(new_factor=None, **kwargs):
-            return new_factor * torch.ones(1, device="cuda")
+            return new_factor * torch.ones(1, device=torch_device)

         compiled_function = torch.compile(dummy_function, fullgraph=True)
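The same substitution keeps the compile-safety check backend-neutral. A hedged, standalone sketch of how such a compiled function would be exercised follows; the version string is a stand-in for the INFINITE_VERSION constant defined in the test file, not an import from it.

import torch
from transformers.testing_utils import torch_device
from transformers.utils.deprecation import deprecate_kwarg


@deprecate_kwarg("deprecated_factor", new_name="new_factor", version="1000.0")
def dummy_function(new_factor=None, **kwargs):
    # torch_device may be "cuda", "xpu", or "cpu"; no hard-coded backend in the traced graph
    return new_factor * torch.ones(1, device=torch_device)


compiled_function = torch.compile(dummy_function, fullgraph=True)
print(compiled_function(new_factor=2))  # tensor([2.]) on the active device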

tests/utils/test_modeling_utils.py

@@ -63,7 +63,6 @@ from transformers.testing_utils import (
     require_tf,
     require_torch,
     require_torch_accelerator,
-    require_torch_gpu,
     require_torch_multi_accelerator,
     require_usr_bin_time,
     slow,
@@ -1896,7 +1895,7 @@ class ModelUtilsTest(TestCasePlus):
     @parameterized.expand([("Qwen/Qwen2.5-3B-Instruct", 10), ("meta-llama/Llama-2-7b-chat-hf", 10)])
     @slow
     @require_read_token
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_loading_is_fast_on_gpu(self, model_id: str, max_loading_time: float):
         """
         This test is used to avoid regression on https://github.com/huggingface/transformers/pull/36380.
@@ -1913,27 +1912,30 @@ class ModelUtilsTest(TestCasePlus):
             import time
             import argparse
             from transformers import AutoModelForCausalLM
+            from transformers.utils import is_torch_accelerator_available

             parser = argparse.ArgumentParser()
             parser.add_argument("model_id", type=str)
             parser.add_argument("max_loading_time", type=float)
             args = parser.parse_args()

-            device = torch.device("cuda:0")
+            device_type = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda"
+            device = torch.device(f"{device_type}:0")

-            torch.cuda.synchronize(device)
+            torch_accelerator_module = getattr(torch, device_type, torch.cuda)
+            torch_accelerator_module.synchronize(device)
             t0 = time.time()
             model = AutoModelForCausalLM.from_pretrained(args.model_id, torch_dtype=torch.float16, device_map=device)
-            torch.cuda.synchronize(device)
+            torch_accelerator_module.synchronize(device)
             dt = time.time() - t0

             # Assert loading is faster (it should be more than enough in both cases)
             if dt > args.max_loading_time:
                 raise ValueError(f"Loading took {dt:.2f}s! It should not take more than {args.max_loading_time}s")

-            # Ensure everything is correctly loaded on gpu
+            # Ensure everything is correctly loaded on accelerator
             bad_device_params = {k for k, v in model.named_parameters() if v.device != device}
             if len(bad_device_params) > 0:
-                raise ValueError(f"The following parameters are not on GPU: {bad_device_params}")
+                raise ValueError(f"The following parameters are not on accelerator: {bad_device_params}")
             """
         )
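Written out on its own, the device-dispatch idiom the inline script switches to looks like the sketch below. The hasattr guard is an extra hedge of mine for PyTorch builds that predate the torch.accelerator API; the script above handles that case through is_torch_accelerator_available instead.

import torch

# Resolve the active backend name: "cuda", "xpu", ... (torch.accelerator is PyTorch 2.6+);
# fall back to CUDA when the accelerator API is not available.
if hasattr(torch, "accelerator") and torch.accelerator.is_available():
    device_type = torch.accelerator.current_accelerator().type
else:
    device_type = "cuda"

device = torch.device(f"{device_type}:0")

# torch.cuda, torch.xpu, ... expose the same synchronize()/max_memory_allocated() surface,
# so a single getattr yields a backend-neutral module handle for the rest of the script.
torch_accelerator_module = getattr(torch, device_type, torch.cuda)
torch_accelerator_module.synchronize(device)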

tests/utils/test_skip_decorators.py

@@ -33,7 +33,7 @@ import unittest

 import pytest
 from parameterized import parameterized

-from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
+from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device

 # skipping in unittest tests
@@ -59,17 +59,22 @@ def check_slow_torch_cuda():
         assert False, "should have been skipped"


+def check_slow_torch_accelerator():
+    run_slow = bool(os.getenv("RUN_SLOW", 0))
+    assert run_slow and torch_device in ["cuda", "xpu"], "should have been skipped"
+
+
 @require_torch
 class SkipTester(unittest.TestCase):
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_2_skips_slow_first(self):
-        check_slow_torch_cuda()
+        check_slow_torch_accelerator()

-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_2_skips_slow_last(self):
-        check_slow_torch_cuda()
+        check_slow_torch_accelerator()

     # The combination of any skip decorator, followed by parameterized fails to skip the tests
     # 1. @slow manages to correctly skip `test_param_slow_first`
@@ -96,15 +101,15 @@ class SkipTester(unittest.TestCase):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 def test_pytest_2_skips_slow_first():
-    check_slow_torch_cuda()
+    check_slow_torch_accelerator()


-@require_torch_gpu
+@require_torch_accelerator
 @slow
 def test_pytest_2_skips_slow_last():
-    check_slow_torch_cuda()
+    check_slow_torch_accelerator()


 @slow
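As a usage note, the reworked helper and decorators gate on the same two conditions, RUN_SLOW being set and torch_device resolving to a supported accelerator, so an invocation along the lines of RUN_SLOW=1 python -m pytest tests/utils/test_skip_decorators.py exercises both decorator orderings. Below is a hedged sketch of a pytest-style test written against that contract; the test name is illustrative, not part of the PR.

import os

from transformers.testing_utils import require_torch_accelerator, slow, torch_device


@slow
@require_torch_accelerator
def test_example_accelerator_gated():
    # If the decorators let execution reach this point, both conditions must hold,
    # mirroring check_slow_torch_accelerator() above.
    run_slow = bool(os.getenv("RUN_SLOW", 0))
    assert run_slow and torch_device in ["cuda", "xpu"], "should have been skipped"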