From a72cb314347bd6b393eb4a48d597c2e57bfd5c4a Mon Sep 17 00:00:00 2001
From: Yao Matrix
Date: Fri, 9 May 2025 14:45:01 +0800
Subject: [PATCH] enable utils test cases on XPU (#38005)

* enable utils test cases on XPU

Signed-off-by: Yao Matrix

* fix style

Signed-off-by: Yao Matrix

* Update tests/utils/test_skip_decorators.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* fix comment

Signed-off-by: Yao Matrix

---------

Signed-off-by: Yao Matrix
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
---
 tests/utils/test_cache_utils.py     | 20 +++++++++++---------
 tests/utils/test_deprecation.py     |  6 +++---
 tests/utils/test_modeling_utils.py  | 16 +++++++++-------
 tests/utils/test_skip_decorators.py | 23 ++++++++++++++---------
 4 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py
index 48cecb52dcb..243ae657c10 100644
--- a/tests/utils/test_cache_utils.py
+++ b/tests/utils/test_cache_utils.py
@@ -28,6 +28,7 @@ from transformers.testing_utils import (
     require_torch,
     require_torch_accelerator,
     require_torch_gpu,
+    require_torch_multi_accelerator,
     require_torch_multi_gpu,
     slow,
     torch_device,
@@ -355,7 +356,7 @@ class CacheHardIntegrationTest(unittest.TestCase):
         self.assertIsInstance(gen_out.past_key_values, DynamicCache)  # sanity check
 
     @parameterized.expand([("eager"), ("sdpa")])
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_static_cache_greedy_decoding_pad_left(self, attn_implementation):
         """Tests that different cache implementations work well with eager and SDPA inference"""
@@ -436,7 +437,7 @@ class CacheHardIntegrationTest(unittest.TestCase):
             offloaded_peak_memory = torch_accelerator_module.max_memory_allocated(device)
         self.assertTrue(offloaded_peak_memory < original_peak_memory)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_cache_copy(self):
         """Tests that we can manually set a cache, copy, and reuse it for generation"""
@@ -444,14 +445,14 @@ class CacheHardIntegrationTest(unittest.TestCase):
         # lazy init of cache layers
         model_name = "microsoft/Phi-3-mini-4k-instruct"
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16)
+        model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch_device, torch_dtype=torch.bfloat16)
         prompt_cache = StaticCache(
-            config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16
+            config=model.config, max_batch_size=1, max_cache_len=1024, device=torch_device, dtype=torch.bfloat16
         )
 
         INITIAL_PROMPT = "You are a helpful assistant. "
" - inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda") + inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(torch_device) # This is the common prompt cached, we need to run forward without grad to be able to copy with torch.no_grad(): prompt_cache = model(**inputs_initial_prompt, past_key_values=prompt_cache).past_key_values @@ -459,7 +460,7 @@ class CacheHardIntegrationTest(unittest.TestCase): prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"] responses = [] for prompt in prompts: - new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda") + new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to(torch_device) past_key_values = copy.deepcopy(prompt_cache) outputs = model.generate( **new_inputs, past_key_values=past_key_values, max_new_tokens=40, disable_compile=True @@ -474,6 +475,7 @@ class CacheHardIntegrationTest(unittest.TestCase): "You are a helpful assistant. What is the capital of France?\n\n\n## Response:Paris is the capital " "of France.\n\n\n\n\n\n\n<|endoftext|>", ] + self.assertEqual(responses, EXPECTED_DECODED_TEXT) @require_torch_multi_gpu @@ -526,11 +528,11 @@ class CacheHardIntegrationTest(unittest.TestCase): model.generate(**inputs, max_new_tokens=2, cache_implementation="static") self.assertNotIn("cuda", cap.err.lower()) - @require_torch_multi_gpu + @require_torch_multi_accelerator @slow @require_read_token - def test_static_cache_multi_gpu(self): - """Regression test for #35164: static cache with multi-gpu""" + def test_static_cache_multi_accelerator(self): + """Regression test for #35164: static cache with multi-accelerator""" model_id = "google/gemma-2-2b-it" tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/tests/utils/test_deprecation.py b/tests/utils/test_deprecation.py index bf9f63e070b..81b46af37eb 100644 --- a/tests/utils/test_deprecation.py +++ b/tests/utils/test_deprecation.py @@ -18,7 +18,7 @@ import warnings from parameterized import parameterized from transformers import __version__, is_torch_available -from transformers.testing_utils import require_torch_gpu +from transformers.testing_utils import require_torch_accelerator, torch_device from transformers.utils.deprecation import deprecate_kwarg @@ -174,11 +174,11 @@ class DeprecationDecoratorTester(unittest.TestCase): result = dummy_function(deprecated_name="old_value", new_name="new_value") self.assertEqual(result, "new_value") - @require_torch_gpu + @require_torch_accelerator def test_compile_safe(self): @deprecate_kwarg("deprecated_factor", new_name="new_factor", version=INFINITE_VERSION) def dummy_function(new_factor=None, **kwargs): - return new_factor * torch.ones(1, device="cuda") + return new_factor * torch.ones(1, device=torch_device) compiled_function = torch.compile(dummy_function, fullgraph=True) diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 77d87dc3546..2df33849639 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -63,7 +63,6 @@ from transformers.testing_utils import ( require_tf, require_torch, require_torch_accelerator, - require_torch_gpu, require_torch_multi_accelerator, require_usr_bin_time, slow, @@ -1896,7 +1895,7 @@ class ModelUtilsTest(TestCasePlus): @parameterized.expand([("Qwen/Qwen2.5-3B-Instruct", 10), ("meta-llama/Llama-2-7b-chat-hf", 10)]) @slow @require_read_token - @require_torch_gpu + @require_torch_accelerator def 
     def test_loading_is_fast_on_gpu(self, model_id: str, max_loading_time: float):
         """
         This test is used to avoid regression on https://github.com/huggingface/transformers/pull/36380.
@@ -1913,27 +1912,30 @@ class ModelUtilsTest(TestCasePlus):
             import time
             import argparse
             from transformers import AutoModelForCausalLM
+            from transformers.utils import is_torch_accelerator_available
 
             parser = argparse.ArgumentParser()
             parser.add_argument("model_id", type=str)
             parser.add_argument("max_loading_time", type=float)
             args = parser.parse_args()
 
-            device = torch.device("cuda:0")
+            device_type = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda"
+            device = torch.device(f"{device_type}:0")
 
-            torch.cuda.synchronize(device)
+            torch_accelerator_module = getattr(torch, device_type, torch.cuda)
+            torch_accelerator_module.synchronize(device)
             t0 = time.time()
             model = AutoModelForCausalLM.from_pretrained(args.model_id, torch_dtype=torch.float16, device_map=device)
-            torch.cuda.synchronize(device)
+            torch_accelerator_module.synchronize(device)
             dt = time.time() - t0
 
             # Assert loading is faster (it should be more than enough in both cases)
             if dt > args.max_loading_time:
                 raise ValueError(f"Loading took {dt:.2f}s! It should not take more than {args.max_loading_time}s")
 
-            # Ensure everything is correctly loaded on gpu
+            # Ensure everything is correctly loaded on accelerator
             bad_device_params = {k for k, v in model.named_parameters() if v.device != device}
             if len(bad_device_params) > 0:
-                raise ValueError(f"The following parameters are not on GPU: {bad_device_params}")
+                raise ValueError(f"The following parameters are not on accelerator: {bad_device_params}")
             """
         )
diff --git a/tests/utils/test_skip_decorators.py b/tests/utils/test_skip_decorators.py
index abaefad1bf4..5ef578f0c3b 100644
--- a/tests/utils/test_skip_decorators.py
+++ b/tests/utils/test_skip_decorators.py
@@ -33,7 +33,7 @@ import unittest
 import pytest
 from parameterized import parameterized
 
-from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
+from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device
 
 
 # skipping in unittest tests
@@ -59,17 +59,22 @@ def check_slow_torch_cuda():
         assert False, "should have been skipped"
 
 
+def check_slow_torch_accelerator():
+    run_slow = bool(os.getenv("RUN_SLOW", 0))
+    assert run_slow and torch_device in ["cuda", "xpu"], "should have been skipped"
+
+
 @require_torch
 class SkipTester(unittest.TestCase):
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_2_skips_slow_first(self):
-        check_slow_torch_cuda()
+        check_slow_torch_accelerator()
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_2_skips_slow_last(self):
-        check_slow_torch_cuda()
+        check_slow_torch_accelerator()
 
     # The combination of any skip decorator, followed by parameterized fails to skip the tests
     # 1. @slow manages to correctly skip `test_param_slow_first`
@@ -96,15 +101,15 @@ class SkipTester(unittest.TestCase):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 def test_pytest_2_skips_slow_first():
-    check_slow_torch_cuda()
+    check_slow_torch_accelerator()
 
 
-@require_torch_gpu
+@require_torch_accelerator
 @slow
 def test_pytest_2_skips_slow_last():
-    check_slow_torch_cuda()
+    check_slow_torch_accelerator()
 
 
 @slow
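
The edits above all apply the same device-agnostic pattern: resolve the active accelerator once, then address it through torch_device or the matching torch backend module instead of hard-coded "cuda" strings and CUDA-only decorators. A minimal standalone sketch of that pattern follows, assuming PyTorch >= 2.6 for the torch.accelerator API; the helper name resolve_accelerator and the hasattr() fallback are illustrative assumptions, not APIs from this patch or from transformers.

    # Sketch of the device-agnostic selection pattern used throughout this patch.
    # Assumes PyTorch >= 2.6 for torch.accelerator; resolve_accelerator() and the
    # hasattr() guard are illustrative, not transformers or patch APIs.
    import torch

    def resolve_accelerator():
        # Return (device, backend module) for whichever accelerator is present.
        if hasattr(torch, "accelerator") and torch.accelerator.is_available():
            device_type = torch.accelerator.current_accelerator().type  # e.g. "cuda" or "xpu"
        else:
            device_type = "cuda"  # fall back to the historical CUDA-only behavior
        device = torch.device(f"{device_type}:0")
        # getattr lets the same code drive torch.cuda, torch.xpu, ... uniformly
        backend = getattr(torch, device_type, torch.cuda)
        return device, backend

    device, backend = resolve_accelerator()
    backend.synchronize(device)  # same kind of call the loading-time test above relies on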