Add vision models to doc tests (#15905)

* Add vision models to doc tests

* Apply suggestions from code review

* Add more models

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
This commit is contained in:
NielsRogge 2022-03-03 19:46:31 +01:00 committed by GitHub
parent 742273a52a
commit 9251427c38
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 50 additions and 16 deletions

View File

@@ -754,6 +754,7 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
```python
>>> from transformers import BeitFeatureExtractor, BeitForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests
@@ -763,9 +764,15 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
>>> feature_extractor = BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
>>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, logits = outputs.loss, outputs.logits
>>> list(logits.shape)
[1, 196, 8192]
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@@ -587,19 +587,25 @@ class DeiTForMaskedImageModeling(DeiTPreTrainedModel):
Examples:
```python
>>> from transformers import DeiTFeatureExtractor, DeiTForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = DeiTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> model = DeiTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")
>>> feature_extractor = DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224")
>>> model = DeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits
>>> list(reconstructed_pixel_values.shape)
[1, 3, 224, 224]
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@@ -810,6 +810,7 @@ class SwinForMaskedImageModeling(SwinPreTrainedModel):
Examples:
```python
>>> from transformers import AutoFeatureExtractor, SwinForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests
@@ -819,10 +820,15 @@ class SwinForMaskedImageModeling(SwinPreTrainedModel):
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
>>> model = SwinForMaskedImageModeling.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits
>>> list(reconstructed_pixel_values.shape)
[1, 3, 224, 224]
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@@ -624,6 +624,7 @@ class ViTForMaskedImageModeling(ViTPreTrainedModel):
Examples:
```python
>>> from transformers import ViTFeatureExtractor, ViTForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests
@@ -633,10 +634,15 @@ class ViTForMaskedImageModeling(ViTPreTrainedModel):
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits
>>> list(reconstructed_pixel_values.shape)
[1, 3, 224, 224]
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@@ -9,7 +9,16 @@ src/transformers/models/sew/modeling_sew.py
src/transformers/models/sew_d/modeling_sew_d.py
src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
src/transformers/models/speech_to_text/modeling_speech_to_text.py
src/transformers/models/speech_encoder_decoder/modeling_speech_enocder_decoder.py
src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
src/transformers/models/data2vec/modeling_data2vec_audio.py
src/transformers/models/vit/modeling_vit.py
src/transformers/models/beit/modeling_beit.py
src/transformers/models/deit/modeling_deit.py
src/transformers/models/swin/modeling_swin.py
src/transformers/models/convnext/modeling_convnext.py
src/transformers/models/poolformer/modeling_poolformer.py
src/transformers/models/vit_mae/modeling_vit_mae.py
src/transformers/models/segformer/modeling_segformer.py
src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
docs/source/quicktour.mdx
docs/source/task_summary.mdx
docs/source/task_summary.mdx