Mirror of https://github.com/huggingface/transformers.git
enable 6 modeling cases on XPU (#37571)
Signed-off-by: YAO Matrix <matrix.yao@intel.com>
This commit is contained in:
parent 3cd6627cd7
commit a1b82563f1

@@ -19,7 +19,14 @@ import unittest
 import pytest
 
 from transformers import AutoTokenizer, BambaConfig, is_torch_available
-from transformers.testing_utils import Expectations, require_torch, require_torch_gpu, slow, torch_device
+from transformers.testing_utils import (
+    Expectations,
+    require_deterministic_for_xpu,
+    require_torch,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -474,7 +481,7 @@ class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
 
 @slow
 @require_torch
-@require_torch_gpu
+@require_torch_accelerator
 class BambaModelIntegrationTest(unittest.TestCase):
     model = None
     tokenizer = None
@@ -507,6 +514,10 @@ class BambaModelIntegrationTest(unittest.TestCase):
                     "rocm",
                     9,
                 ): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
+                (
+                    "xpu",
+                    3,
+                ): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. Today I",
             }
         )
 
@@ -536,22 +547,30 @@ class BambaModelIntegrationTest(unittest.TestCase):
 
         torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD, rtol=1e-3, atol=1)
 
+    @require_deterministic_for_xpu
     def test_simple_batched_generate_with_padding(self):
         # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
         #
         # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
         # considering differences in hardware processing and potential deviations in generated text.
-        EXPECTED_TEXTS = {
-            7: [],
-            8: [
-                "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
-                "!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the",
-            ],
-            9: [
-                "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
-                "!!!<|begin_of_text|>I am late! I need to be at the airport in 20 minutes! I",
-            ],
-        }
+        EXPECTED_TEXTS = Expectations(
+            {
+                ("cuda", 7): [],
+                ("cuda", 8): [
+                    "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
+                    "!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the",
+                ],
+                ("rocm", 9): [
+                    "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
+                    "!!!<|begin_of_text|>I am late! I need to be at the airport in 20 minutes! I",
+                ],
+                ("xpu", 3): [
+                    "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. Today I",
+                    "!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the",
+                ],
+            }
+        )
+        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
 
         self.model.to(torch_device)
 
@@ -562,8 +581,8 @@ class BambaModelIntegrationTest(unittest.TestCase):
         ).to(torch_device)
         out = self.model.generate(**inputs, do_sample=False, max_new_tokens=10)
         output_sentences = self.tokenizer.batch_decode(out)
-        self.assertEqual(output_sentences[0], EXPECTED_TEXTS[self.cuda_compute_capability_major_version][0])
-        self.assertEqual(output_sentences[1], EXPECTED_TEXTS[self.cuda_compute_capability_major_version][1])
+        self.assertEqual(output_sentences[0], EXPECTED_TEXT[0])
+        self.assertEqual(output_sentences[1], EXPECTED_TEXT[1])
 
         # TODO: there are significant differences in the logits across major cuda versions, which shouldn't exist
         if self.cuda_compute_capability_major_version == 8:
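
The recurring change in the Bamba hunks above replaces dicts indexed with self.cuda_compute_capability_major_version by the Expectations helper from transformers.testing_utils, which keys expected outputs by a (device_type, major_version) pair and selects the entry for the current machine via get_expectation(). A standalone sketch of that pattern, separate from the diff; the strings are placeholders (not measured model outputs) and it assumes the host exposes one of the listed accelerators:

from transformers.testing_utils import Expectations

# Expected generations keyed by (device_type, major_version); the strings below
# are placeholders, not real model outputs.
EXPECTED_TEXTS = Expectations(
    {
        ("cuda", 8): "output observed on A100/A10",
        ("rocm", 9): "output observed on MI300",
        ("xpu", 3): "output observed on Intel XPU",
    }
)

# Picks the entry matching the accelerator the test is running on.
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
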
@@ -643,7 +643,7 @@ class GemmaIntegrationTest(unittest.TestCase):
         self.assertEqual(output_text, EXPECTED_TEXTS)
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_read_token
     def test_compile_static_cache(self):
         # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2
@@ -520,7 +520,13 @@ class MptIntegrationTests(unittest.TestCase):
 
         outputs = model(dummy_input, output_hidden_states=True)
 
-        expected_slice = torch.Tensor([-0.2520, -0.2178, -0.1953]).to(torch_device, torch.bfloat16)
+        expected_slices = Expectations(
+            {
+                ("xpu", 3): torch.Tensor([-0.2090, -0.2061, -0.1465]),
+                ("cuda", 7): torch.Tensor([-0.2520, -0.2178, -0.1953]),
+            }
+        )
+        expected_slice = expected_slices.get_expectation().to(torch_device, torch.bfloat16)
         predicted_slice = outputs.hidden_states[-1][0, 0, :3]
 
         torch.testing.assert_close(expected_slice, predicted_slice, rtol=1e-3, atol=1e-3)
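
The MptIntegrationTests hunk above applies the same pattern to numeric expectations: one reference tensor per accelerator, resolved at runtime and cast to the test device and dtype so a single torch.testing.assert_close call with unchanged tolerances covers all hardware. A standalone sketch of that flow, separate from the diff (values copied from the hunk; assumes transformers and torch are installed and the host matches one of the listed devices):

import torch
from transformers.testing_utils import Expectations, torch_device

expected_slices = Expectations(
    {
        ("xpu", 3): torch.Tensor([-0.2090, -0.2061, -0.1465]),
        ("cuda", 7): torch.Tensor([-0.2520, -0.2178, -0.1953]),
    }
)
# Resolve for the current accelerator, then match the model's device and dtype
# so the comparison tolerances stay identical across hardware.
expected_slice = expected_slices.get_expectation().to(torch_device, torch.bfloat16)

# In the test, predicted_slice comes from the model's last hidden state:
# predicted_slice = outputs.hidden_states[-1][0, 0, :3]
# torch.testing.assert_close(expected_slice, predicted_slice, rtol=1e-3, atol=1e-3)
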
@@ -21,6 +21,7 @@ import pytest
 
 from transformers import NemotronConfig, is_torch_available
 from transformers.testing_utils import (
+    Expectations,
     is_flaky,
     require_flash_attn,
     require_read_token,
@@ -168,7 +169,7 @@ class NemotronModelTest(GemmaModelTest):
         assert torch.allclose(logits_fa, logits, atol=1e-2)
 
 
-@require_torch_gpu
+@require_torch_accelerator
 class NemotronIntegrationTest(unittest.TestCase):
     # This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
     # Depending on the hardware we get different logits / generations
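
Alongside the expectation tables, the decorator swap repeated in these files (require_torch_gpu to require_torch_accelerator, plus require_deterministic_for_xpu on the Bamba batched-generation test) is what allows the same integration tests to be collected on XPU runners. A minimal, hypothetical test class sketching how the decorators from transformers.testing_utils combine; the class and test names below are illustrative only:

import unittest

import torch

from transformers.testing_utils import require_torch_accelerator, slow, torch_device


@slow
@require_torch_accelerator  # skipped unless a torch accelerator (CUDA, ROCm, XPU, ...) is available
class ExampleIntegrationTest(unittest.TestCase):  # hypothetical class for illustration
    def test_tensor_lands_on_accelerator(self):
        # torch_device resolves to the available accelerator (e.g. "cuda" or "xpu")
        x = torch.ones(2, 2).to(torch_device)
        self.assertEqual(x.device.type, torch.device(torch_device).type)
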
@@ -202,9 +203,17 @@ class NemotronIntegrationTest(unittest.TestCase):
     @require_read_token
     def test_nemotron_8b_generation_eager(self):
         text = ["What is the largest planet in solar system?"]
-        EXPECTED_TEXT = [
-            "What is the largest planet in solar system?\nAnswer: Jupiter\n\nWhat is the answer",
-        ]
+        EXPECTED_TEXTS = Expectations(
+            {
+                ("xpu", 3): [
+                    "What is the largest planet in solar system?\nAnswer: Jupiter\n\nWhat is the answer: What is the name of the 19",
+                ],
+                ("cuda", 7): [
+                    "What is the largest planet in solar system?\nAnswer: Jupiter\n\nWhat is the answer",
+                ],
+            }
+        )
+        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
         model_id = "thhaus/nemotron3-8b"
         model = NemotronForCausalLM.from_pretrained(
             model_id, torch_dtype=torch.float16, device_map="auto", attn_implementation="eager"