From 3bc65505fc0801e3d9ff741ec725fb0cb4d863d6 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Thu, 12 Oct 2023 10:01:07 +0200
Subject: [PATCH] Fix doctest for `Blip2ForConditionalGeneration` (#26737)

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: ydshieh
---
 .../models/blip_2/modeling_blip_2.py          | 79 ++++++-------------
 utils/slow_documentation_tests.txt            |  1 +
 2 files changed, 24 insertions(+), 56 deletions(-)

diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py
index 87c8132ff4f..bd56b17e55c 100644
--- a/src/transformers/models/blip_2/modeling_blip_2.py
+++ b/src/transformers/models/blip_2/modeling_blip_2.py
@@ -1272,14 +1272,10 @@ class Blip2Model(Blip2PreTrainedModel):
         >>> import torch
         >>> from transformers import AutoTokenizer, Blip2Model
 
-        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
-
-        >>> model.to(device)  # doctest: +IGNORE_RESULT
+        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
 
         >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-opt-2.7b")
-        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt").to(device)
+        >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt")
         >>> text_features = model.get_text_features(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1333,16 +1329,12 @@ class Blip2Model(Blip2PreTrainedModel):
         >>> import requests
         >>> from transformers import AutoProcessor, Blip2Model
 
-        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
-
-        >>> model.to(device)  # doctest: +IGNORE_RESULT
+        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
         >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
-        >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+        >>> inputs = processor(images=image, return_tensors="pt")
         >>> image_outputs = model.get_image_features(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1381,15 +1373,12 @@ class Blip2Model(Blip2PreTrainedModel):
         >>> import requests
         >>> from transformers import Blip2Processor, Blip2Model
 
-        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
-
         >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
-        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
-        >>> model.to(device)  # doctest: +IGNORE_RESULT
+        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
-        >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+        >>> inputs = processor(images=image, return_tensors="pt")
         >>> qformer_outputs = model.get_qformer_features(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1654,34 +1643,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
 
         Examples:
 
-        Image captioning (without providing a text prompt):
-
-        ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import Blip2Processor, Blip2ForConditionalGeneration
-        >>> import torch
-
-        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
-        >>> model = Blip2ForConditionalGeneration.from_pretrained(
-        ...     "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
-        ... )
-        >>> model.to(device)  # doctest: +IGNORE_RESULT
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
-
-        >>> generated_ids = model.generate(**inputs)
-        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-        >>> print(generated_text)
-        two cats laying on a couch
-        ```
-
-        Visual question answering (prompt = question):
+        Prepare processor, model and image input
 
         ```python
         >>> from PIL import Image
@@ -1698,7 +1660,22 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
+        ```
 
+        Image captioning (without providing a text prompt):
+
+        ```python
+        >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+
+        >>> generated_ids = model.generate(**inputs)
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        >>> print(generated_text)
+        two cats laying on a couch
+        ```
+
+        Visual question answering (prompt = question):
+
+        ```python
         >>> prompt = "Question: how many cats are there? Answer:"
         >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device="cuda", dtype=torch.float16)
 
@@ -1712,20 +1689,10 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
         This greatly reduces the amount of memory used by the model while maintaining the same performance.
 
         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import Blip2Processor, Blip2ForConditionalGeneration
-        >>> import torch
-
-        >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
         >>> model = Blip2ForConditionalGeneration.from_pretrained(
-        ...     "Salesforce/blip2-flan-t5-xl", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.bfloat16
+        ...     "Salesforce/blip2-opt-2.7b", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.bfloat16
        ... )  # doctest: +IGNORE_RESULT
 
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> prompt = "Question: how many cats are there? Answer:"
         >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device="cuda", dtype=torch.bfloat16)
 
         >>> generated_ids = model.generate(**inputs)
diff --git a/utils/slow_documentation_tests.txt b/utils/slow_documentation_tests.txt
index f72216b1345..7e2db3f2a5f 100644
--- a/utils/slow_documentation_tests.txt
+++ b/utils/slow_documentation_tests.txt
@@ -1,4 +1,5 @@
 docs/source/en/generation_strategies.md
 docs/source/en/model_doc/ctrl.md
 docs/source/en/task_summary.md
+src/transformers/models/blip_2/modeling_blip_2.py
 src/transformers/models/ctrl/modeling_ctrl.py
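
For reference, the device-free `get_text_features` example that this patch leaves in the docstring can also be exercised as a plain script. The sketch below simply assembles the doctest lines from the diff; the checkpoint download is several GB and full-precision CPU inference is assumed (the `print` at the end is an illustrative addition, not part of the doctest):

```python
from transformers import AutoTokenizer, Blip2Model

# Full-precision load on CPU, exactly as in the fixed doctest
# (no device selection, no torch.float16, no .to(device)).
model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-opt-2.7b")

# A single prompt keeps the example cheap; padding=True is kept from the doctest.
inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt")
text_features = model.get_text_features(**inputs)
print(type(text_features))  # language-model output object
```

Because the patch also adds `modeling_blip_2.py` to `utils/slow_documentation_tests.txt`, these doctests are skipped in the regular CI run and only exercised in the slow job; locally they can be run with pytest's `--doctest-modules` mode as described in the repository's doc-testing guide.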