Merge branch 'main' into static
commit 09ef0d16ee
@@ -176,7 +176,7 @@ class LlavaOnevisionConfig(PretrainedConfig):
                 patch_size=14,
                 image_size=384,
                 num_hidden_layers=26,
-                num_attention_heads=14,
+                num_attention_heads=16,
                 vision_use_head=False,
             )
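Note: the head-count fix matters because attention requires the hidden size to split evenly across heads. A minimal sketch of that check, assuming the SigLIP vision tower's default hidden_size of 1152 (the value is not shown in this hunk and is taken from the SigLIP so400m defaults):

# Divisibility check: 14 heads leaves a remainder, 16 gives head_dim 72.
hidden_size = 1152  # assumption: SigLIP so400m default, not from this hunk
for num_attention_heads in (14, 16):
    head_dim, remainder = divmod(hidden_size, num_attention_heads)
    print(num_attention_heads, head_dim, remainder)  # 14 -> 82 r4, 16 -> 72 r0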
@@ -434,6 +434,10 @@ class SmolVLMProcessor(ProcessorMixin):
         if chat_template is None and has_video:
             # re-assign to the correct default template for BC, if user is not requesting their own template
             chat_template = DEFAULT_CHAT_TEMPLATE
+
+        kwargs.setdefault("num_frames", self.video_processor.num_frames)
+        kwargs.setdefault("fps", self.video_processor.fps)
+
         return super().apply_chat_template(conversation, chat_template, **kwargs)
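With these defaults in place, callers no longer need to pass num_frames or fps explicitly when templating a video conversation; the video processor's own values are used unless overridden. A usage sketch, with the checkpoint name and message content taken from the test added later in this commit:

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "path": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_1_1080p.mov",
            },
            {"type": "text", "text": "Describe this video in detail"},
        ],
    },
]
# num_frames and fps now default from processor.video_processor
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
)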
@@ -536,23 +536,24 @@ class SmolVLMForConditionalGenerationIntegrationTest(unittest.TestCase):
                 ).content
             )
         )
-        self.image2 = Image.open(
-            BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
-        )
-        self.image3 = Image.open(
-            BytesIO(
-                requests.get(
-                    "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
-                ).content
-            )
-        )
+        self.video_messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "path": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_1_1080p.mov",
+                    },
+                    {"type": "text", "text": "Describe this video in detail"},
+                ],
+            },
+        ]
+

     def tearDown(self):
         cleanup(torch_device, gc_collect=True)

     @slow
-    # TODO (Orr?) this is a dummy test to check if the model generates things that make sense.
-    # Needs to be expanded to a tiny video
     def test_integration_test(self):
         model = SmolVLMForConditionalGeneration.from_pretrained(
             "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
@@ -571,3 +572,26 @@ class SmolVLMForConditionalGenerationIntegrationTest(unittest.TestCase):

         expected_generated_text = "\n\n\n\nIn this image, we see a view of the Statue of Liberty and the"
         self.assertEqual(generated_texts[0], expected_generated_text)
+
+    @slow
+    def test_integration_test_video(self):
+        model = SmolVLMForConditionalGeneration.from_pretrained(
+            "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+        )
+
+        # Create inputs
+        inputs = self.processor.apply_chat_template(
+            self.video_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(device=torch_device, dtype=torch.bfloat16)
+
+        generated_ids = model.generate(**inputs, max_new_tokens=20)
+        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        expected_generated_text = 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature'  # fmt: skip
+        self.assertEqual(generated_texts[0], expected_generated_text)
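Both integration tests are gated behind @slow, so they are skipped by default and run only when slow tests are enabled, e.g. with `RUN_SLOW=1 python -m pytest -k test_integration_test_video` (the exact test path is omitted here). Note that batch_decode with skip_special_tokens=True returns the rendered prompt, the "User: ... nine frames ..." listing produced by the default video chat template, followed by the model's continuation, which is why the expected string includes both. A hypothetical variant that compares only the newly generated text would slice off the prompt tokens first:

# Hypothetical: compare only the continuation, not the echoed prompt.
new_tokens = generated_ids[:, inputs["input_ids"].shape[1] :]
continuation = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]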