Fix Perceiver docs (#14879)
This commit is contained in:
parent e37bc579fc
commit 7df4b90c76

@@ -72,7 +72,7 @@ size of 262 byte IDs).
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perceiver_architecture.jpg"
alt="drawing" width="600"/>

-<small> Perceiver IO architecture. Taken from the [original paper](https://arxiv.org/abs/2105.15203) </small>
+<small> Perceiver IO architecture. Taken from the <a href="https://arxiv.org/abs/2105.15203">original paper</a> </small>

This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found
[here](https://github.com/deepmind/deepmind-research/tree/master/perceiver).

@@ -1881,14 +1881,29 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
```python
>>> from transformers import PerceiverForMultimodalAutoencoding
>>> import torch
>>> import numpy as np

>>> # create multimodal inputs
>>> images = torch.randn((1, 16, 3, 224, 224))
>>> audio = torch.randn((1, 30720, 1))
>>> inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))

>>> model = PerceiverForMultimodalAutoencoding.from_pretrained('deepmind/multimodal-perceiver')

>>> outputs = model(inputs=inputs)
>>> # in the Perceiver IO paper, videos are auto-encoded in chunks
>>> # each chunk subsamples different index dimensions of the image and audio modality decoder queries
>>> nchunks = 128
>>> image_chunk_size = np.prod((16, 224, 224)) // nchunks
>>> audio_chunk_size = audio.shape[1] // model.config.samples_per_patch // nchunks
>>> # process the first chunk
>>> chunk_idx = 0
>>> subsampling = {
...     "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
...     "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
...     "label": None,
... }

>>> outputs = model(inputs=inputs, subsampled_output_points=subsampling)
>>> logits = outputs.logits
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
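
The fixed example only decodes the first of the 128 chunks. As a rough sketch of how the remaining chunks could be handled with the same `subsampled_output_points` mechanism (assuming the variables from the example above are in scope, and assuming that `outputs.logits` for this multimodal model is a dictionary keyed by modality), one could loop over the chunks and concatenate the per-chunk logits:

```python
# Sketch: decode every chunk and stitch the per-modality reconstructions back together.
# Assumes model, inputs, nchunks, image_chunk_size and audio_chunk_size from the example above.
reconstruction = {"image": [], "audio": []}
for chunk_idx in range(nchunks):
    subsampling = {
        "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
        "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
        "label": None,
    }
    outputs = model(inputs=inputs, subsampled_output_points=subsampling)
    # assumption: the multimodal model returns its logits as a dict keyed by modality
    reconstruction["image"].append(outputs.logits["image"])
    reconstruction["audio"].append(outputs.logits["audio"])

# concatenate the per-chunk logits along the (assumed) subsampled index dimension
video_logits = torch.cat(reconstruction["image"], dim=1)
audio_logits = torch.cat(reconstruction["audio"], dim=1)
```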