mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-20 13:08:21 +06:00
Fix aya_vision test (#38674)
* fix 1: load_in_4bit=True, * fix 2: decorator * fixfix 2: breakpoint * fixfix 3: update * fixfix 4: fast * fixfix 5: cond * fixfix 5: cond * fixfix 6: cuda 8 * ruff * breakpoint * dtype * a10 * a10 --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
parent
b61c47f5a5
commit
e55983e2b9
@ -27,6 +27,7 @@ from transformers import (
|
|||||||
from transformers.testing_utils import (
|
from transformers.testing_utils import (
|
||||||
Expectations,
|
Expectations,
|
||||||
cleanup,
|
cleanup,
|
||||||
|
get_device_properties,
|
||||||
require_deterministic_for_xpu,
|
require_deterministic_for_xpu,
|
||||||
require_read_token,
|
require_read_token,
|
||||||
require_torch,
|
require_torch,
|
||||||
@ -330,19 +331,39 @@ class AyaVisionModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
|
|||||||
@require_read_token
|
@require_read_token
|
||||||
@require_torch
|
@require_torch
|
||||||
class AyaVisionIntegrationTest(unittest.TestCase):
|
class AyaVisionIntegrationTest(unittest.TestCase):
|
||||||
def setUp(self):
|
@classmethod
|
||||||
self.model_checkpoint = "CohereForAI/aya-vision-8b"
|
def setUpClass(cls):
|
||||||
|
cls.model_checkpoint = "CohereForAI/aya-vision-8b"
|
||||||
|
cls.model = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def tearDownClass(cls):
|
||||||
|
del cls.model_checkpoint
|
||||||
|
cleanup(torch_device, gc_collect=True)
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
cleanup(torch_device, gc_collect=True)
|
cleanup(torch_device, gc_collect=True)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_model(cls):
|
||||||
|
# Use 4-bit on T4
|
||||||
|
load_in_4bit = get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8
|
||||||
|
torch_dtype = None if load_in_4bit else torch.float16
|
||||||
|
|
||||||
|
if cls.model is None:
|
||||||
|
cls.model = AyaVisionForConditionalGeneration.from_pretrained(
|
||||||
|
cls.model_checkpoint,
|
||||||
|
device_map=torch_device,
|
||||||
|
torch_dtype=torch_dtype,
|
||||||
|
load_in_4bit=load_in_4bit,
|
||||||
|
)
|
||||||
|
return cls.model
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@require_torch_accelerator
|
@require_torch_accelerator
|
||||||
def test_small_model_integration_forward(self):
|
def test_small_model_integration_forward(self):
|
||||||
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
|
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
|
||||||
model = AyaVisionForConditionalGeneration.from_pretrained(
|
model = self.get_model()
|
||||||
self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
||||||
)
|
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@ -361,7 +382,17 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
output = model(**inputs)
|
output = model(**inputs)
|
||||||
|
|
||||||
actual_logits = output.logits[0, -1, :5].cpu()
|
actual_logits = output.logits[0, -1, :5].cpu()
|
||||||
expected_logits = torch.tensor([0.4109, 0.1532, 0.8018, 2.1328, 0.5483], dtype=torch.float16)
|
|
||||||
|
EXPECTED_LOGITS = Expectations(
|
||||||
|
{
|
||||||
|
("xpu", 3): [0.4109, 0.1532, 0.8018, 2.1328, 0.5483],
|
||||||
|
# 4-bit
|
||||||
|
("cuda", 7): [0.1097, 0.3481, 3.8340, 9.7969, 2.0488],
|
||||||
|
("cuda", 8): [1.6396, 0.6094, 3.1992, 8.5234, 2.1875],
|
||||||
|
}
|
||||||
|
) # fmt: skip
|
||||||
|
expected_logits = torch.tensor(EXPECTED_LOGITS.get_expectation(), dtype=torch.float16)
|
||||||
|
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
torch.allclose(actual_logits, expected_logits, atol=0.1),
|
torch.allclose(actual_logits, expected_logits, atol=0.1),
|
||||||
f"Actual logits: {actual_logits}"
|
f"Actual logits: {actual_logits}"
|
||||||
@ -374,9 +405,7 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
@require_deterministic_for_xpu
|
@require_deterministic_for_xpu
|
||||||
def test_small_model_integration_generate_text_only(self):
|
def test_small_model_integration_generate_text_only(self):
|
||||||
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
|
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
|
||||||
model = AyaVisionForConditionalGeneration.from_pretrained(
|
model = self.get_model()
|
||||||
self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
||||||
)
|
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@ -398,7 +427,9 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
expected_outputs = Expectations(
|
expected_outputs = Expectations(
|
||||||
{
|
{
|
||||||
("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit sky,\nNature's quiet song.",
|
("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit sky,\nNature's quiet song.",
|
||||||
("cuda", 7): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.",
|
# 4-bit
|
||||||
|
("cuda", 7): "Sure, here's a haiku for you:\n\nMorning dew sparkles,\nPetals unfold in sunlight,\n",
|
||||||
|
("cuda", 8): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.",
|
||||||
}
|
}
|
||||||
) # fmt: skip
|
) # fmt: skip
|
||||||
expected_output = expected_outputs.get_expectation()
|
expected_output = expected_outputs.get_expectation()
|
||||||
@ -409,9 +440,7 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
@require_torch_accelerator
|
@require_torch_accelerator
|
||||||
def test_small_model_integration_generate_chat_template(self):
|
def test_small_model_integration_generate_chat_template(self):
|
||||||
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
|
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
|
||||||
model = AyaVisionForConditionalGeneration.from_pretrained(
|
model = self.get_model()
|
||||||
self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
||||||
)
|
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@ -430,16 +459,24 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
decoded_output = processor.decode(
|
decoded_output = processor.decode(
|
||||||
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
||||||
)
|
)
|
||||||
expected_output = "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats," # fmt: skip
|
|
||||||
|
expected_outputs = Expectations(
|
||||||
|
{
|
||||||
|
("xpu", 3): "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,",
|
||||||
|
# 4-bit
|
||||||
|
("cuda", 7): 'The image depicts two cats comfortably resting on a pink blanket spread across a sofa. The cats,',
|
||||||
|
("cuda", 8): 'The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,',
|
||||||
|
}
|
||||||
|
) # fmt: skip
|
||||||
|
expected_output = expected_outputs.get_expectation()
|
||||||
|
|
||||||
self.assertEqual(decoded_output, expected_output)
|
self.assertEqual(decoded_output, expected_output)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@require_torch_accelerator
|
@require_torch_accelerator
|
||||||
def test_small_model_integration_batched_generate(self):
|
def test_small_model_integration_batched_generate(self):
|
||||||
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
|
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
|
||||||
model = AyaVisionForConditionalGeneration.from_pretrained(
|
model = self.get_model()
|
||||||
self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
||||||
)
|
|
||||||
# Prepare inputs
|
# Prepare inputs
|
||||||
messages = [
|
messages = [
|
||||||
[
|
[
|
||||||
@ -472,7 +509,9 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
expected_outputs = Expectations(
|
expected_outputs = Expectations(
|
||||||
{
|
{
|
||||||
("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
|
("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
|
||||||
("cuda", 7): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.",
|
# 4-bit
|
||||||
|
("cuda", 7): "Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene",
|
||||||
|
("cuda", 8): 'Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.',
|
||||||
}
|
}
|
||||||
) # fmt: skip
|
) # fmt: skip
|
||||||
expected_output = expected_outputs.get_expectation()
|
expected_output = expected_outputs.get_expectation()
|
||||||
@ -485,7 +524,16 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
# Check second output
|
# Check second output
|
||||||
decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
||||||
expected_output = 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a' # fmt: skip
|
|
||||||
|
expected_outputs = Expectations(
|
||||||
|
{
|
||||||
|
("xpu", 3): 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a',
|
||||||
|
# 4-bit
|
||||||
|
("cuda", 7): 'This vibrant image captures a bustling street scene in a multicultural urban area, featuring a traditional Chinese gate adorned with intricate red and',
|
||||||
|
("cuda", 8): 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a',
|
||||||
|
}
|
||||||
|
) # fmt: skip
|
||||||
|
expected_output = expected_outputs.get_expectation()
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
decoded_output,
|
decoded_output,
|
||||||
@ -498,9 +546,7 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
@require_deterministic_for_xpu
|
@require_deterministic_for_xpu
|
||||||
def test_small_model_integration_batched_generate_multi_image(self):
|
def test_small_model_integration_batched_generate_multi_image(self):
|
||||||
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
|
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
|
||||||
model = AyaVisionForConditionalGeneration.from_pretrained(
|
model = self.get_model()
|
||||||
self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
||||||
)
|
|
||||||
# Prepare inputs
|
# Prepare inputs
|
||||||
messages = [
|
messages = [
|
||||||
[
|
[
|
||||||
@ -543,7 +589,8 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
expected_outputs = Expectations(
|
expected_outputs = Expectations(
|
||||||
{
|
{
|
||||||
("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
|
("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
|
||||||
("cuda", 7): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.",
|
("cuda", 7): 'Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene',
|
||||||
|
("cuda", 8): 'Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.',
|
||||||
}
|
}
|
||||||
) # fmt: skip
|
) # fmt: skip
|
||||||
expected_output = expected_outputs.get_expectation()
|
expected_output = expected_outputs.get_expectation()
|
||||||
@ -559,10 +606,12 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
expected_outputs = Expectations(
|
expected_outputs = Expectations(
|
||||||
{
|
{
|
||||||
("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",
|
("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",
|
||||||
("cuda", 7): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at a",
|
("cuda", 7): 'The first image showcases the Statue of Liberty, a monumental sculpture located on Liberty Island in New York Harbor. Standing atop a',
|
||||||
|
("cuda", 8): 'The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ',
|
||||||
}
|
}
|
||||||
) # fmt: skip
|
) # fmt: skip
|
||||||
expected_output = expected_outputs.get_expectation()
|
expected_output = expected_outputs.get_expectation()
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
decoded_output,
|
decoded_output,
|
||||||
expected_output,
|
expected_output,
|
||||||
|
@ -17,7 +17,7 @@ import tempfile
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from transformers import AutoProcessor, AutoTokenizer, AyaVisionProcessor
|
from transformers import AutoProcessor, AutoTokenizer, AyaVisionProcessor
|
||||||
from transformers.testing_utils import require_read_token, require_torch, require_vision
|
from transformers.testing_utils import require_torch, require_vision
|
||||||
from transformers.utils import is_torch_available, is_vision_available
|
from transformers.utils import is_torch_available, is_vision_available
|
||||||
|
|
||||||
from ...test_processing_common import ProcessorTesterMixin
|
from ...test_processing_common import ProcessorTesterMixin
|
||||||
@ -31,7 +31,6 @@ if is_vision_available():
|
|||||||
from transformers import GotOcr2ImageProcessor
|
from transformers import GotOcr2ImageProcessor
|
||||||
|
|
||||||
|
|
||||||
@require_read_token
|
|
||||||
@require_vision
|
@require_vision
|
||||||
class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||||
processor_class = AyaVisionProcessor
|
processor_class = AyaVisionProcessor
|
||||||
|
Loading…
Reference in New Issue
Block a user