[InstructBLIP] Fix bos token of LLaMa checkpoints (#24492)

* Add fix

* Fix doctest
NielsRogge 2023-07-11 21:43:01 +02:00 committed by GitHub
parent aac4c79968
commit bb13a92859
2 changed files with 8 additions and 3 deletions

@@ -1360,7 +1360,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel):
         >>> processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

         >>> device = "cuda" if torch.cuda.is_available() else "cpu"
-        >>> model.to(device)
+        >>> model.to(device)  # doctest: +IGNORE_RESULT

         >>> url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
@@ -1380,7 +1380,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel):
         ... )
         >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
         >>> print(generated_text)
-        What is unusual about this image? The image is unusual because it depicts a person standing on top of a car, which is parked on the side of the road. This is an unusual position for a person to be in, as they are typically not expected to stand on top of a car while it is parked. Additionally, the person in the image appears to be wearing a suit and tie, which is not typical attire for someone who is standing on top of a car. It is unclear why the person is in this unusual position or what they are doing there.
+        The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV, which is parked in the middle of a busy city street. This is an unconventional approach to ironing clothes, as it requires the man to balance himself and his ironing equipment on top of the vehicle while navigating through traffic. Additionally, the presence of taxis and other vehicles in the scene further emphasizes the unusual nature of this situation.
         ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1554,4 +1554,10 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel):
             **generate_kwargs,
         )

+        # the InstructBLIP authors used inconsistent tokenizer/model files during training,
+        # with the tokenizer's bos token being set to </s>, which has ID=2,
+        # whereas the model's text config has bos token id = 0
+        if self.config.text_config.architectures[0] == "LLaMAForCausalLM":
+            outputs[outputs == 0] = 2
+
         return outputs
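The remap above can be sketched in isolation. This is a hedged, standalone illustration of the idea, not the library code itself: it assumes token ID 0 is the bos id written by the model and ID 2 is the `</s>`/bos id the tokenizer expects, and the helper name `remap_bos` is invented for the example.

```python
import torch


def remap_bos(outputs: torch.Tensor) -> torch.Tensor:
    """Replace the model's bos id (0) with the tokenizer's expected id (2).

    Assumption for this sketch: 0 only ever appears as the bos token in
    `outputs`, so a blanket replacement is safe.
    """
    outputs = outputs.clone()  # avoid mutating the caller's tensor
    outputs[outputs == 0] = 2
    return outputs


# Example: two generated sequences that start with bos id 0.
ids = torch.tensor([[0, 100, 200], [0, 300, 400]])
print(remap_bos(ids).tolist())
```

Note the in-place masked assignment the commit uses (`outputs[outputs == 0] = 2`) mutates the generation output directly; the sketch clones first only to keep the helper side-effect free.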

@@ -544,7 +544,6 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
         # verify generation
         outputs = model.generate(**inputs, max_new_tokens=30)
-        outputs[outputs == 0] = 2
         generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
         # fmt: off