diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py
index ada1e82a534..5dbaec26ff7 100644
--- a/src/transformers/models/blip_2/modeling_blip_2.py
+++ b/src/transformers/models/blip_2/modeling_blip_2.py
@@ -2228,7 +2228,12 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
         # otherwise we expand manually by concatenating
         if getattr(self.config, "image_token_id", None) is not None:
             special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
+
+            inputs_embeds = inputs_embeds.to(language_model_inputs.device)
+            special_image_mask = special_image_mask.to(language_model_inputs.device)
             inputs_embeds[special_image_mask] = language_model_inputs.flatten()
+
+            attention_mask = attention_mask.to(language_attention_mask.device)
         else:
             logger.warning_once(
                 "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py
index af95bbb2c32..05322f01d26 100644
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -1786,7 +1786,8 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
         generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
 
         # Test output
-        self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118])
+        expected_ids = [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118]
+        self.assertEqual(predictions[0].tolist(), [50265] * 32 + expected_ids)  # 50265 is the img token id
         self.assertEqual("a woman sitting on the beach with a dog", generated_text)
 
         # image and context
@@ -1797,10 +1798,8 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
         generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
 
         # Test output
-        self.assertEqual(
-            predictions[0].tolist(),
-            [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118],
-        )
+        expected_ids = [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118]
+        self.assertEqual(predictions[0].tolist(), [50265] * 32 + expected_ids)  # 50265 is the img token id
         self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach")
 
     @require_torch_multi_accelerator
@@ -1826,8 +1825,18 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
         generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
 
         # Test output
-        self.assertEqual(predictions[0].tolist(), [0, 2335, 1556, 28, 1782, 30, 8, 2608, 1])
-        self.assertEqual("woman playing with dog on the beach", generated_text)
+        expected_ids_and_text = Expectations({
+            ("cuda", None): (
+                [0, 2335, 1556, 28, 1782, 30, 8, 2608, 1],
+                "woman playing with dog on the beach"
+            ),
+            ("rocm", (9, 5)): (
+                [0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
+                "a woman is playing with her dog on the beach"
+            )
+        }).get_expectation()
+        self.assertEqual(predictions[0].tolist(), expected_ids_and_text[0])
+        self.assertEqual(generated_text, expected_ids_and_text[1])
 
         # image and context
         prompt = "Question: which city is this? Answer:"
@@ -1837,11 +1846,18 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
         generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
 
         # Test output
-        self.assertEqual(
-            predictions[0].tolist(),
-            [0, 3, 7, 152, 67, 839, 1],
-        )
-        self.assertEqual(generated_text, "san diego")
+        expected_ids_and_text = Expectations({
+            ("cuda", None): (
+                [0, 3, 7, 152, 67, 839, 1],
+                "san diego"
+            ),
+            ("rocm", (9, 5)): (
+                [0, 3, 7, 152, 2515, 11389, 3523, 1],
+                "san francisco"  # TODO: check if this is ok
+            )
+        }).get_expectation()
+        self.assertEqual(predictions[0].tolist(), expected_ids_and_text[0])
+        self.assertEqual(generated_text, expected_ids_and_text[1])
 
     def test_expansion_in_processing(self):
         processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")