From a72cb314347bd6b393eb4a48d597c2e57bfd5c4a Mon Sep 17 00:00:00 2001
From: Yao Matrix
Date: Fri, 9 May 2025 14:45:01 +0800
Subject: [PATCH] enable utils test cases on XPU (#38005)

* enable utils test cases on XPU

Signed-off-by: Yao Matrix

* fix style

Signed-off-by: Yao Matrix

* Update tests/utils/test_skip_decorators.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* fix comment

Signed-off-by: Yao Matrix

---------

Signed-off-by: Yao Matrix
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
---
 tests/utils/test_cache_utils.py     | 20 +++++++++++---------
 tests/utils/test_deprecation.py     |  6 +++---
 tests/utils/test_modeling_utils.py  | 16 +++++++++-------
 tests/utils/test_skip_decorators.py | 23 ++++++++++++++---------
 4 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py
index 48cecb52dcb..243ae657c10 100644
--- a/tests/utils/test_cache_utils.py
+++ b/tests/utils/test_cache_utils.py
@@ -28,6 +28,7 @@ from transformers.testing_utils import (
     require_torch,
     require_torch_accelerator,
     require_torch_gpu,
+    require_torch_multi_accelerator,
     require_torch_multi_gpu,
     slow,
     torch_device,
@@ -355,7 +356,7 @@ class CacheHardIntegrationTest(unittest.TestCase):
         self.assertIsInstance(gen_out.past_key_values, DynamicCache)  # sanity check
 
     @parameterized.expand([("eager"), ("sdpa")])
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_static_cache_greedy_decoding_pad_left(self, attn_implementation):
         """Tests that different cache implementations work well with eager and SDPA inference"""
@@ -436,7 +437,7 @@ class CacheHardIntegrationTest(unittest.TestCase):
             offloaded_peak_memory = torch_accelerator_module.max_memory_allocated(device)
         self.assertTrue(offloaded_peak_memory < original_peak_memory)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_cache_copy(self):
         """Tests that we can manually set a cache, copy, and reuse it for generation"""
@@ -444,14 +445,14 @@ class CacheHardIntegrationTest(unittest.TestCase):
         # lazy init of cache layers
         model_name = "microsoft/Phi-3-mini-4k-instruct"
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16)
+        model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch_device, torch_dtype=torch.bfloat16)
         prompt_cache = StaticCache(
-            config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16
+            config=model.config, max_batch_size=1, max_cache_len=1024, device=torch_device, dtype=torch.bfloat16
         )
 
         INITIAL_PROMPT = "You are a helpful assistant. "
" - inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda") + inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(torch_device) # This is the common prompt cached, we need to run forward without grad to be able to copy with torch.no_grad(): prompt_cache = model(**inputs_initial_prompt, past_key_values=prompt_cache).past_key_values @@ -459,7 +460,7 @@ class CacheHardIntegrationTest(unittest.TestCase): prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"] responses = [] for prompt in prompts: - new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda") + new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to(torch_device) past_key_values = copy.deepcopy(prompt_cache) outputs = model.generate( **new_inputs, past_key_values=past_key_values, max_new_tokens=40, disable_compile=True @@ -474,6 +475,7 @@ class CacheHardIntegrationTest(unittest.TestCase): "You are a helpful assistant. What is the capital of France?\n\n\n## Response:Paris is the capital " "of France.\n\n\n\n\n\n\n<|endoftext|>", ] + self.assertEqual(responses, EXPECTED_DECODED_TEXT) @require_torch_multi_gpu @@ -526,11 +528,11 @@ class CacheHardIntegrationTest(unittest.TestCase): model.generate(**inputs, max_new_tokens=2, cache_implementation="static") self.assertNotIn("cuda", cap.err.lower()) - @require_torch_multi_gpu + @require_torch_multi_accelerator @slow @require_read_token - def test_static_cache_multi_gpu(self): - """Regression test for #35164: static cache with multi-gpu""" + def test_static_cache_multi_accelerator(self): + """Regression test for #35164: static cache with multi-accelerator""" model_id = "google/gemma-2-2b-it" tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/tests/utils/test_deprecation.py b/tests/utils/test_deprecation.py index bf9f63e070b..81b46af37eb 100644 --- a/tests/utils/test_deprecation.py +++ b/tests/utils/test_deprecation.py @@ -18,7 +18,7 @@ import warnings from parameterized import parameterized from transformers import __version__, is_torch_available -from transformers.testing_utils import require_torch_gpu +from transformers.testing_utils import require_torch_accelerator, torch_device from transformers.utils.deprecation import deprecate_kwarg @@ -174,11 +174,11 @@ class DeprecationDecoratorTester(unittest.TestCase): result = dummy_function(deprecated_name="old_value", new_name="new_value") self.assertEqual(result, "new_value") - @require_torch_gpu + @require_torch_accelerator def test_compile_safe(self): @deprecate_kwarg("deprecated_factor", new_name="new_factor", version=INFINITE_VERSION) def dummy_function(new_factor=None, **kwargs): - return new_factor * torch.ones(1, device="cuda") + return new_factor * torch.ones(1, device=torch_device) compiled_function = torch.compile(dummy_function, fullgraph=True) diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 77d87dc3546..2df33849639 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -63,7 +63,6 @@ from transformers.testing_utils import ( require_tf, require_torch, require_torch_accelerator, - require_torch_gpu, require_torch_multi_accelerator, require_usr_bin_time, slow, @@ -1896,7 +1895,7 @@ class ModelUtilsTest(TestCasePlus): @parameterized.expand([("Qwen/Qwen2.5-3B-Instruct", 10), ("meta-llama/Llama-2-7b-chat-hf", 10)]) @slow @require_read_token - @require_torch_gpu + @require_torch_accelerator def 
     def test_loading_is_fast_on_gpu(self, model_id: str, max_loading_time: float):
         """
         This test is used to avoid regression on https://github.com/huggingface/transformers/pull/36380.
@@ -1913,27 +1912,30 @@ class ModelUtilsTest(TestCasePlus):
             import time
             import argparse
             from transformers import AutoModelForCausalLM
+            from transformers.utils import is_torch_accelerator_available
 
             parser = argparse.ArgumentParser()
             parser.add_argument("model_id", type=str)
             parser.add_argument("max_loading_time", type=float)
             args = parser.parse_args()
 
-            device = torch.device("cuda:0")
+            device_type = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda"
+            device = torch.device(f"{device_type}:0")
 
-            torch.cuda.synchronize(device)
+            torch_accelerator_module = getattr(torch, device_type, torch.cuda)
+            torch_accelerator_module.synchronize(device)
             t0 = time.time()
             model = AutoModelForCausalLM.from_pretrained(args.model_id, torch_dtype=torch.float16, device_map=device)
-            torch.cuda.synchronize(device)
+            torch_accelerator_module.synchronize(device)
             dt = time.time() - t0
 
             # Assert loading is faster (it should be more than enough in both cases)
             if dt > args.max_loading_time:
                 raise ValueError(f"Loading took {dt:.2f}s! It should not take more than {args.max_loading_time}s")
 
-            # Ensure everything is correctly loaded on gpu
+            # Ensure everything is correctly loaded on accelerator
             bad_device_params = {k for k, v in model.named_parameters() if v.device != device}
             if len(bad_device_params) > 0:
-                raise ValueError(f"The following parameters are not on GPU: {bad_device_params}")
+                raise ValueError(f"The following parameters are not on accelerator: {bad_device_params}")
             """
         )
diff --git a/tests/utils/test_skip_decorators.py b/tests/utils/test_skip_decorators.py
index abaefad1bf4..5ef578f0c3b 100644
--- a/tests/utils/test_skip_decorators.py
+++ b/tests/utils/test_skip_decorators.py
@@ -33,7 +33,7 @@ import unittest
 import pytest
 from parameterized import parameterized
 
-from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
+from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device
 
 
 # skipping in unittest tests
@@ -59,17 +59,22 @@ def check_slow_torch_cuda():
         assert False, "should have been skipped"
 
 
+def check_slow_torch_accelerator():
+    run_slow = bool(os.getenv("RUN_SLOW", 0))
+    assert run_slow and torch_device in ["cuda", "xpu"], "should have been skipped"
+
+
 @require_torch
 class SkipTester(unittest.TestCase):
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_2_skips_slow_first(self):
-        check_slow_torch_cuda()
+        check_slow_torch_accelerator()
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_2_skips_slow_last(self):
-        check_slow_torch_cuda()
+        check_slow_torch_accelerator()
 
     # The combination of any skip decorator, followed by parameterized fails to skip the tests
     # 1. @slow manages to correctly skip `test_param_slow_first`
@@ -96,15 +101,15 @@ class SkipTester(unittest.TestCase):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 def test_pytest_2_skips_slow_first():
-    check_slow_torch_cuda()
+    check_slow_torch_accelerator()
 
 
-@require_torch_gpu
+@require_torch_accelerator
 @slow
 def test_pytest_2_skips_slow_last():
-    check_slow_torch_cuda()
+    check_slow_torch_accelerator()
 
 
 @slow
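
The edits above all apply the same device-agnostic pattern: resolve the active accelerator once, then address it through torch_device or the matching torch backend module instead of hard-coded "cuda" strings and CUDA-only decorators. A minimal standalone sketch of that pattern follows, assuming PyTorch >= 2.6 for the torch.accelerator API; the helper name resolve_accelerator and the hasattr() fallback are illustrative assumptions, not APIs from this patch or from transformers.

    # Sketch of the device-agnostic selection pattern used throughout this patch.
    # Assumes PyTorch >= 2.6 for torch.accelerator; resolve_accelerator() and the
    # hasattr() guard are illustrative, not transformers or patch APIs.
    import torch

    def resolve_accelerator():
        # Return (device, backend module) for whichever accelerator is present.
        if hasattr(torch, "accelerator") and torch.accelerator.is_available():
            device_type = torch.accelerator.current_accelerator().type  # e.g. "cuda" or "xpu"
        else:
            device_type = "cuda"  # fall back to the historical CUDA-only behavior
        device = torch.device(f"{device_type}:0")
        # getattr lets the same code drive torch.cuda, torch.xpu, ... uniformly
        backend = getattr(torch, device_type, torch.cuda)
        return device, backend

    device, backend = resolve_accelerator()
    backend.synchronize(device)  # same kind of call the loading-time test above relies on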