diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 5d60f6248b1..a4457a45f42 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -29,6 +29,7 @@ from transformers import ( is_vision_available, ) from transformers.testing_utils import ( + Expectations, cleanup, require_bitsandbytes, require_torch, @@ -362,7 +363,14 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) output = model.generate(**inputs, max_new_tokens=900, do_sample=False) - EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip + + EXPECTED_DECODED_TEXTS = Expectations( + { + ("cuda", 7): 'USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Lastly, be respectful of the environment and other visitors, as the pier is a shared space where people can enjoy the view, relax, or engage in recreational activities.', + ("cuda", 8): 'USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Lastly, be respectful of the environment and other visitors, as the pier is a shared space where people can enjoy the view, relax, or engage in recreational activities.', + } + ) # fmt: skip + EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation() self.assertEqual( processor.decode(output[0], skip_special_tokens=True), @@ -415,10 +423,20 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): output = model.generate(**inputs, max_new_tokens=20) - EXPECTED_DECODED_TEXT = [ - 'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.', - 'USER: \nWhat is this?\nASSISTANT: Cats' - ] # fmt: skip + EXPECTED_DECODED_TEXTS = Expectations( + { + ("cuda", 7): [ + 'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring along', + 'USER: \nWhat is this?\nASSISTANT: Cats', + ], + ("cuda", 8): [ + 'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring along', + 'USER: \nWhat is this?\nASSISTANT: Cats', + ], + } + ) # fmt: skip + EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation() + self.assertEqual( self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT, @@ -481,11 +499,21 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): model = model.eval() - EXPECTED_OUTPUT = [ - "\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while", - "\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small", - "\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the", - ] + EXPECTED_OUTPUTS = Expectations( + { + ("cuda", 7): [ + "\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one of them has a dog standing on a field, while", + "\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small", + "\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the", + ], + ("cuda", 8): [ + "\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one of them has a dog standing on a field, while", + '\nUSER: Describe the image.\nASSISTANT: The image features a beautiful blonde dog sitting on a sidewalk. The dog is holding a small', + '\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the', + ], + } + ) # fmt: skip + EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation() generate_ids = model.generate(**inputs, max_new_tokens=20) outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) @@ -556,32 +584,23 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): IMG_URLS = [ Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), ] - PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" + PROMPT = "[INST]Describe the images.\n[IMG][IMG][/INST]" # image = Image.open(requests.get(url, stream=True).raw) inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device) - generate_ids = model.generate(**inputs, max_new_tokens=500) + generate_ids = model.generate(**inputs, max_new_tokens=100) output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - print(output) # fmt: off EXPECTED_GENERATION = """ Describe the images. -Certainly! Here are the descriptions of the images: +The first image shows a black dog sitting on a wooden surface. The dog has a glossy coat and is looking directly at the camera with a calm expression. The wooden background appears to be made of weathered wooden planks, giving the image a rustic feel. -1. **Image 1**: This image features a black dog with a glossy coat sitting on a wooden surface. The dog has a calm and attentive expression, looking directly at the camera. The wooden background has a rustic appearance with visible grain and texture. - -2. **Image 2**: This image captures a breathtaking view of a mountainous landscape. The mountains are rugged and covered with patches of green vegetation. The sky above is clear, and the scene conveys a sense of tranquility and natural beauty. - -3. **Image 3**: This image shows a beach scene during sunset. The waves are gently rolling onto the shore, and several people can be seen in the water, possibly surfing or swimming. The sky is painted with warm hues of orange and yellow, creating a serene and picturesque atmosphere. - -4. **Image 4**: This image depicts a narrow, winding path that cuts through a lush, green landscape. On either side of the path, there is dense grass and various trees, including a prominent tree with white blossoms. The sky is clear and blue, adding to the peaceful and inviting ambiance of the scene. - -These descriptions provide a detailed overview of the content and atmosphere of each image. +The second image depicts a scenic mountain landscape. The mountains are rugged and covered with patches of green vegetation. The sky is clear, and the scene conveys a sense of tranquility and natural beauty. The mountains extend into the """ + # Remove the first and last empty character. + EXPECTED_GENERATION = EXPECTED_GENERATION[1:-1] # fmt: on # check that both inputs are handled correctly and generate the same output self.assertEqual(output, EXPECTED_GENERATION) @@ -603,12 +622,13 @@ These descriptions provide a detailed overview of the content and atmosphere of generate_ids = model.generate(**inputs, max_new_tokens=50) output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - EXPECTED_GENERATION = [ - # CUDA output - "Describe the images. The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador", - # XPU output - "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which covers the entire background. The dog appears to be the main focus", - ] # fmt: skip + EXPECTED_GENERATIONS = Expectations( + { + ("cuda", 7): "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador", + ("xpu", 3): "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which covers the entire background. The dog appears to be the main focus", + } + ) # fmt: skip + EXPECTED_GENERATION = EXPECTED_GENERATIONS.get_expectation() self.assertTrue(output in EXPECTED_GENERATION) @slow