From 5964f820db1568d26298b37dea9db328185c7f7c Mon Sep 17 00:00:00 2001 From: Maria Khalusova Date: Fri, 3 Nov 2023 10:57:03 -0400 Subject: [PATCH] [Docs] Model_doc structure/clarity improvements (#26876) * first batch of structure improvements for model_docs * second batch of structure improvements for model_docs * more structure improvements for model_docs * more structure improvements for model_docs * structure improvements for cv model_docs * more structural refactoring * addressed feedback about image processors --- docs/source/en/model_doc/albert.md | 26 ++++-- docs/source/en/model_doc/align.md | 9 +- docs/source/en/model_doc/altclip.md | 11 +-- .../audio-spectrogram-transformer.md | 18 ++-- docs/source/en/model_doc/autoformer.md | 2 - docs/source/en/model_doc/bark.md | 13 ++- docs/source/en/model_doc/bart.md | 36 ++++---- docs/source/en/model_doc/barthez.md | 8 +- docs/source/en/model_doc/bartpho.md | 8 +- docs/source/en/model_doc/beit.md | 17 +++- docs/source/en/model_doc/bert-generation.md | 15 ++-- docs/source/en/model_doc/bert-japanese.md | 12 ++- docs/source/en/model_doc/bert.md | 32 +++++-- docs/source/en/model_doc/bertweet.md | 11 ++- docs/source/en/model_doc/big_bird.md | 20 ++++- docs/source/en/model_doc/bigbird_pegasus.md | 8 +- docs/source/en/model_doc/biogpt.md | 10 +-- docs/source/en/model_doc/bit.md | 11 ++- docs/source/en/model_doc/blenderbot-small.md | 25 ++++-- docs/source/en/model_doc/blenderbot.md | 50 ++++++----- docs/source/en/model_doc/blip-2.md | 10 +-- docs/source/en/model_doc/blip.md | 22 +++-- docs/source/en/model_doc/bloom.md | 22 +++-- docs/source/en/model_doc/bort.md | 14 ++-- docs/source/en/model_doc/bridgetower.md | 7 +- docs/source/en/model_doc/bros.md | 11 ++- docs/source/en/model_doc/byt5.md | 8 +- docs/source/en/model_doc/camembert.md | 24 ++++-- docs/source/en/model_doc/canine.md | 23 ++--- docs/source/en/model_doc/chinese_clip.md | 8 +- docs/source/en/model_doc/clap.md | 2 - docs/source/en/model_doc/clip.md | 19 ++++- docs/source/en/model_doc/clipseg.md | 14 ++-- docs/source/en/model_doc/code_llama.md | 23 +++-- docs/source/en/model_doc/codegen.md | 4 +- docs/source/en/model_doc/conditional_detr.md | 2 +- docs/source/en/model_doc/convbert.md | 17 +++- docs/source/en/model_doc/convnext.md | 13 +-- docs/source/en/model_doc/convnextv2.md | 4 - docs/source/en/model_doc/cpm.md | 9 +- docs/source/en/model_doc/cpmant.md | 5 +- docs/source/en/model_doc/ctrl.md | 18 +++- docs/source/en/model_doc/cvt.md | 14 +++- docs/source/en/model_doc/data2vec.md | 21 +++-- docs/source/en/model_doc/deberta-v2.md | 11 ++- docs/source/en/model_doc/deberta.md | 10 +++ .../en/model_doc/decision_transformer.md | 4 +- docs/source/en/model_doc/deformable_detr.md | 9 +- docs/source/en/model_doc/deit.md | 22 ++--- docs/source/en/model_doc/deplot.md | 12 ++- docs/source/en/model_doc/deta.md | 7 -- docs/source/en/model_doc/detr.md | 20 +++-- docs/source/en/model_doc/dialogpt.md | 9 +- docs/source/en/model_doc/dinat.md | 22 ++--- docs/source/en/model_doc/dinov2.md | 5 -- docs/source/en/model_doc/distilbert.md | 23 ++++- docs/source/en/model_doc/dit.md | 15 ++-- docs/source/en/model_doc/donut.md | 4 +- docs/source/en/model_doc/dpr.md | 13 ++- docs/source/en/model_doc/efficientformer.md | 9 ++ docs/source/en/model_doc/electra.md | 20 ++++- docs/source/en/model_doc/encodec.md | 4 +- docs/source/en/model_doc/encoder-decoder.md | 12 +++ docs/source/en/model_doc/ernie.md | 6 +- docs/source/en/model_doc/ernie_m.md | 19 ++--- docs/source/en/model_doc/esm.md | 22 +++-- 
docs/source/en/model_doc/flan-t5.md | 8 +- docs/source/en/model_doc/flan-ul2.md | 10 +-- docs/source/en/model_doc/flaubert.md | 14 +++- docs/source/en/model_doc/flava.md | 2 - docs/source/en/model_doc/fnet.md | 14 ++-- docs/source/en/model_doc/focalnet.md | 5 -- docs/source/en/model_doc/fsmt.md | 3 - docs/source/en/model_doc/funnel.md | 17 +++- docs/source/en/model_doc/git.md | 9 +- docs/source/en/model_doc/glpn.md | 4 - docs/source/en/model_doc/gpt-sw3.md | 17 ++-- docs/source/en/model_doc/gpt2.md | 26 ++++-- docs/source/en/model_doc/gpt_bigcode.md | 8 +- docs/source/en/model_doc/gpt_neo.md | 16 +++- docs/source/en/model_doc/gpt_neox.md | 4 +- docs/source/en/model_doc/gpt_neox_japanese.md | 4 +- docs/source/en/model_doc/gptj.md | 15 +++- docs/source/en/model_doc/gptsan-japanese.md | 4 +- docs/source/en/model_doc/graphormer.md | 8 +- docs/source/en/model_doc/groupvit.md | 19 +++-- docs/source/en/model_doc/herbert.md | 13 ++- docs/source/en/model_doc/hubert.md | 17 +++- docs/source/en/model_doc/ibert.md | 2 +- docs/source/en/model_doc/idefics.md | 4 +- docs/source/en/model_doc/imagegpt.md | 6 +- docs/source/en/model_doc/informer.md | 2 - docs/source/en/model_doc/instructblip.md | 7 +- docs/source/en/model_doc/jukebox.md | 8 +- docs/source/en/model_doc/layoutlm.md | 13 ++- docs/source/en/model_doc/layoutlmv2.md | 2 +- docs/source/en/model_doc/layoutlmv3.md | 30 ++++--- docs/source/en/model_doc/layoutxlm.md | 10 ++- docs/source/en/model_doc/led.md | 20 ++++- docs/source/en/model_doc/levit.md | 7 +- docs/source/en/model_doc/lilt.md | 18 ++-- docs/source/en/model_doc/llama.md | 9 +- docs/source/en/model_doc/llama2.md | 6 +- docs/source/en/model_doc/longformer.md | 17 +++- docs/source/en/model_doc/longt5.md | 18 +++- docs/source/en/model_doc/luke.md | 18 ++-- docs/source/en/model_doc/lxmert.md | 17 +++- docs/source/en/model_doc/m2m_100.md | 16 ++-- docs/source/en/model_doc/marian.md | 25 ++++-- docs/source/en/model_doc/markuplm.md | 11 +-- docs/source/en/model_doc/mask2former.md | 17 ++-- docs/source/en/model_doc/maskformer.md | 15 ++-- docs/source/en/model_doc/matcha.md | 8 +- docs/source/en/model_doc/mbart.md | 14 +++- docs/source/en/model_doc/mctct.md | 13 ++- docs/source/en/model_doc/mega.md | 10 ++- docs/source/en/model_doc/megatron-bert.md | 12 +-- docs/source/en/model_doc/megatron_gpt2.md | 14 +++- docs/source/en/model_doc/mgp-str.md | 10 +-- docs/source/en/model_doc/mistral.md | 8 +- docs/source/en/model_doc/mluke.md | 8 +- docs/source/en/model_doc/mms.md | 12 ++- docs/source/en/model_doc/mobilebert.md | 16 +++- docs/source/en/model_doc/mobilenet_v1.md | 6 +- docs/source/en/model_doc/mobilenet_v2.md | 6 +- docs/source/en/model_doc/mobilevit.md | 16 +++- docs/source/en/model_doc/mobilevitv2.md | 9 +- docs/source/en/model_doc/mpnet.md | 21 +++-- docs/source/en/model_doc/mpt.md | 5 +- docs/source/en/model_doc/mra.md | 6 -- docs/source/en/model_doc/mt5.md | 13 ++- docs/source/en/model_doc/mvp.md | 10 ++- docs/source/en/model_doc/nat.md | 24 +++--- docs/source/en/model_doc/nezha.md | 2 +- docs/source/en/model_doc/nllb-moe.md | 12 +-- docs/source/en/model_doc/nllb.md | 7 +- docs/source/en/model_doc/nougat.md | 8 +- docs/source/en/model_doc/nystromformer.md | 2 +- docs/source/en/model_doc/oneformer.md | 15 ++-- docs/source/en/model_doc/open-llama.md | 2 +- docs/source/en/model_doc/openai-gpt.md | 20 +++-- docs/source/en/model_doc/opt.md | 39 +++++---- docs/source/en/model_doc/owlv2.md | 18 ++-- docs/source/en/model_doc/owlvit.md | 3 +- docs/source/en/model_doc/pegasus.md | 35 ++++---- 
docs/source/en/model_doc/pegasus_x.md | 12 +-- docs/source/en/model_doc/perceiver.md | 15 ++-- docs/source/en/model_doc/persimmon.md | 6 +- docs/source/en/model_doc/phobert.md | 11 ++- docs/source/en/model_doc/pix2struct.md | 1 - docs/source/en/model_doc/plbart.md | 13 ++- docs/source/en/model_doc/poolformer.md | 5 +- docs/source/en/model_doc/pop2piano.md | 20 ++--- docs/source/en/model_doc/prophetnet.md | 12 +-- docs/source/en/model_doc/qdqbert.md | 12 +-- docs/source/en/model_doc/rag.md | 17 +++- docs/source/en/model_doc/reformer.md | 14 ++-- docs/source/en/model_doc/regnet.md | 22 +++-- docs/source/en/model_doc/rembert.md | 13 ++- docs/source/en/model_doc/resnet.md | 16 ++-- .../en/model_doc/roberta-prelayernorm.md | 22 +++-- docs/source/en/model_doc/roberta.md | 18 +++- docs/source/en/model_doc/roc_bert.md | 12 +-- docs/source/en/model_doc/roformer.md | 24 ++++-- docs/source/en/model_doc/rwkv.md | 3 +- docs/source/en/model_doc/segformer.md | 11 ++- docs/source/en/model_doc/sew-d.md | 8 +- docs/source/en/model_doc/sew.md | 8 +- docs/source/en/model_doc/speech_to_text.md | 12 ++- docs/source/en/model_doc/speech_to_text_2.md | 5 +- docs/source/en/model_doc/splinter.md | 8 +- docs/source/en/model_doc/squeezebert.md | 8 +- docs/source/en/model_doc/swiftformer.md | 5 -- docs/source/en/model_doc/swin.md | 17 ++-- docs/source/en/model_doc/swinv2.md | 3 - .../en/model_doc/switch_transformers.md | 9 +- docs/source/en/model_doc/t5.md | 29 ++++--- docs/source/en/model_doc/t5v1.1.md | 12 ++- docs/source/en/model_doc/table-transformer.md | 9 +- docs/source/en/model_doc/tapas.md | 17 +++- docs/source/en/model_doc/tapex.md | 12 ++- .../en/model_doc/time_series_transformer.md | 15 +--- docs/source/en/model_doc/timesformer.md | 11 +-- .../en/model_doc/trajectory_transformer.md | 7 +- docs/source/en/model_doc/transfo-xl.md | 16 +++- docs/source/en/model_doc/trocr.md | 2 +- docs/source/en/model_doc/tvlt.md | 16 ++-- docs/source/en/model_doc/ul2.md | 14 +++- docs/source/en/model_doc/umt5.md | 19 +++-- docs/source/en/model_doc/unispeech-sat.md | 10 +-- docs/source/en/model_doc/unispeech.md | 10 +-- docs/source/en/model_doc/upernet.md | 22 ++--- docs/source/en/model_doc/van.md | 4 +- docs/source/en/model_doc/videomae.md | 8 +- docs/source/en/model_doc/vilt.md | 20 ++--- .../en/model_doc/vision-encoder-decoder.md | 12 +++ .../en/model_doc/vision-text-dual-encoder.md | 12 +++ docs/source/en/model_doc/visual_bert.md | 8 +- docs/source/en/model_doc/vit.md | 84 +++++++++---------- docs/source/en/model_doc/vit_hybrid.md | 3 - docs/source/en/model_doc/vit_mae.md | 27 +++--- docs/source/en/model_doc/vit_msn.md | 17 ++-- docs/source/en/model_doc/vitdet.md | 7 +- docs/source/en/model_doc/vitmatte.md | 8 +- docs/source/en/model_doc/vits.md | 3 +- docs/source/en/model_doc/vivit.md | 1 - .../source/en/model_doc/wav2vec2-conformer.md | 10 +-- docs/source/en/model_doc/wav2vec2.md | 18 +++- docs/source/en/model_doc/wav2vec2_phoneme.md | 19 +++-- docs/source/en/model_doc/wavlm.md | 14 ++-- docs/source/en/model_doc/whisper.md | 18 ++-- docs/source/en/model_doc/xglm.md | 16 +++- docs/source/en/model_doc/xlm-prophetnet.md | 8 +- docs/source/en/model_doc/xlm-roberta-xl.md | 14 ++-- docs/source/en/model_doc/xlm-roberta.md | 25 ++++-- docs/source/en/model_doc/xlm-v.md | 11 ++- docs/source/en/model_doc/xlm.md | 19 ++++- docs/source/en/model_doc/xlnet.md | 17 +++- docs/source/en/model_doc/xls_r.md | 12 ++- docs/source/en/model_doc/xlsr_wav2vec2.md | 8 +- docs/source/en/model_doc/xmod.md | 8 +- docs/source/en/model_doc/yolos.md | 12 
+-- docs/source/en/model_doc/yoso.md | 13 +-- 223 files changed, 1796 insertions(+), 1116 deletions(-) diff --git a/docs/source/en/model_doc/albert.md b/docs/source/en/model_doc/albert.md index 9e821f2f4d0..b7a819b2ed4 100644 --- a/docs/source/en/model_doc/albert.md +++ b/docs/source/en/model_doc/albert.md @@ -45,7 +45,10 @@ self-supervised loss that focuses on modeling inter-sentence coherence, and show with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE, RACE, and SQuAD benchmarks while having fewer parameters compared to BERT-large.* -Tips: +This model was contributed by [lysandre](https://huggingface.co/lysandre). This model jax version was contributed by +[kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/google-research/ALBERT). + +## Usage tips - ALBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. @@ -56,11 +59,7 @@ Tips: - Layers are split in groups that share parameters (to save memory). Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have been swapped or not. - -This model was contributed by [lysandre](https://huggingface.co/lysandre). This model jax version was contributed by -[kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/google-research/ALBERT). - -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -90,6 +89,9 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). This [[autodoc]] models.albert.modeling_tf_albert.TFAlbertForPreTrainingOutput + + + ## AlbertModel [[autodoc]] AlbertModel @@ -124,6 +126,10 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). This [[autodoc]] AlbertForQuestionAnswering - forward + + + + ## TFAlbertModel [[autodoc]] TFAlbertModel @@ -159,6 +165,9 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). This [[autodoc]] TFAlbertForQuestionAnswering - call + + + ## FlaxAlbertModel [[autodoc]] FlaxAlbertModel @@ -193,3 +202,8 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). This [[autodoc]] FlaxAlbertForQuestionAnswering - __call__ + + + + + diff --git a/docs/source/en/model_doc/align.md b/docs/source/en/model_doc/align.md index faf76853f60..5e41dac6024 100644 --- a/docs/source/en/model_doc/align.md +++ b/docs/source/en/model_doc/align.md @@ -24,7 +24,10 @@ The abstract from the paper is the following: *Pre-trained representations are becoming crucial for many NLP and perception tasks. While representation learning in NLP has transitioned to training on raw text without human annotations, visual and vision-language representations still rely heavily on curated training datasets that are expensive or require expert knowledge. For vision applications, representations are mostly learned using datasets with explicit class labels such as ImageNet or OpenImages. For vision-language, popular datasets like Conceptual Captions, MSCOCO, or CLIP all involve a non-trivial data collection (and cleaning) process. This costly curation process limits the size of datasets and hence hinders the scaling of trained models. 
In this paper, we leverage a noisy dataset of over one billion image alt-text pairs, obtained without expensive filtering or post-processing steps in the Conceptual Captions dataset. A simple dual-encoder architecture learns to align visual and language representations of the image and text pairs using a contrastive loss. We show that the scale of our corpus can make up for its noise and leads to state-of-the-art representations even with such a simple learning scheme. Our visual representation achieves strong performance when transferred to classification tasks such as ImageNet and VTAB. The aligned visual and language representations enables zero-shot image classification and also set new state-of-the-art results on Flickr30K and MSCOCO image-text retrieval benchmarks, even when compared with more sophisticated cross-attention models. The representations also enable cross-modality search with complex text and text + image queries.* -## Usage +This model was contributed by [Alara Dirik](https://huggingface.co/adirik). +The original code is not released, this implementation is based on the Kakao Brain implementation based on the original paper. + +## Usage example ALIGN uses EfficientNet to get visual features and BERT to get the text features. Both the text and visual features are then projected to a latent space with identical dimension. The dot product between the projected image and text features is then used as a similarity score. @@ -56,9 +59,6 @@ probs = logits_per_image.softmax(dim=1) print(probs) ``` -This model was contributed by [Alara Dirik](https://huggingface.co/adirik). -The original code is not released, this implementation is based on the Kakao Brain implementation based on the original paper. - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ALIGN. @@ -69,7 +69,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it. The resource should ideally demonstrate something new instead of duplicating an existing resource. - ## AlignConfig [[autodoc]] AlignConfig diff --git a/docs/source/en/model_doc/altclip.md b/docs/source/en/model_doc/altclip.md index 23cdcb63fbd..b1fc9b38269 100644 --- a/docs/source/en/model_doc/altclip.md +++ b/docs/source/en/model_doc/altclip.md @@ -31,7 +31,9 @@ teacher learning and contrastive learning. We validate our method through evalua performances on a bunch of tasks including ImageNet-CN, Flicker30k- CN, and COCO-CN. Further, we obtain very close performances with CLIP on almost all tasks, suggesting that one can simply alter the text encoder in CLIP for extended capabilities such as multilingual understanding.* -## Usage +This model was contributed by [jongjyh](https://huggingface.co/jongjyh). + +## Usage tips and example The usage of AltCLIP is very similar to the CLIP. the difference between CLIP is the text encoder. Note that we use bidirectional attention instead of casual attention and we take the [CLS] token in XLM-R to represent text embedding. @@ -50,7 +52,6 @@ The [`AltCLIPProcessor`] wraps a [`CLIPImageProcessor`] and a [`XLMRobertaTokeni encode the text and prepare the images. The following example shows how to get the image-text similarity scores using [`AltCLIPProcessor`] and [`AltCLIPModel`]. - ```python >>> from PIL import Image >>> import requests @@ -70,11 +71,11 @@ encode the text and prepare the images. 
The following example shows how to get t >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ``` -Tips: + -This model is build on `CLIPModel`, so use it like a original CLIP. +This model is based on `CLIPModel`, use it like you would use the original [CLIP](clip). -This model was contributed by [jongjyh](https://huggingface.co/jongjyh). + ## AltCLIPConfig diff --git a/docs/source/en/model_doc/audio-spectrogram-transformer.md b/docs/source/en/model_doc/audio-spectrogram-transformer.md index df9fe78c2d4..587ec85d09b 100644 --- a/docs/source/en/model_doc/audio-spectrogram-transformer.md +++ b/docs/source/en/model_doc/audio-spectrogram-transformer.md @@ -26,15 +26,6 @@ The abstract from the paper is the following: *In the past decade, convolutional neural networks (CNNs) have been widely adopted as the main building block for end-to-end audio classification models, which aim to learn a direct mapping from audio spectrograms to corresponding labels. To better capture long-range global context, a recent trend is to add a self-attention mechanism on top of the CNN, forming a CNN-attention hybrid model. However, it is unclear whether the reliance on a CNN is necessary, and if neural networks purely based on attention are sufficient to obtain good performance in audio classification. In this paper, we answer the question by introducing the Audio Spectrogram Transformer (AST), the first convolution-free, purely attention-based model for audio classification. We evaluate AST on various audio classification benchmarks, where it achieves new state-of-the-art results of 0.485 mAP on AudioSet, 95.6% accuracy on ESC-50, and 98.1% accuracy on Speech Commands V2.* -Tips: - -- When fine-tuning the Audio Spectrogram Transformer (AST) on your own dataset, it's recommended to take care of the input normalization (to make -sure the input has mean of 0 and std of 0.5). [`ASTFeatureExtractor`] takes care of this. Note that it uses the AudioSet -mean and std by default. You can check [`ast/src/get_norm_stats.py`](https://github.com/YuanGongND/ast/blob/master/src/get_norm_stats.py) to see how -the authors compute the stats for a downstream dataset. -- Note that the AST needs a low learning rate (the authors use a 10 times smaller learning rate compared to their CNN model proposed in the -[PSLA paper](https://arxiv.org/abs/2102.01243)) and converges quickly, so please search for a suitable learning rate and learning rate scheduler for your task. - drawing @@ -43,6 +34,15 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/YuanGongND/ast). +## Usage tips + +- When fine-tuning the Audio Spectrogram Transformer (AST) on your own dataset, it's recommended to take care of the input normalization (to make +sure the input has mean of 0 and std of 0.5). [`ASTFeatureExtractor`] takes care of this. Note that it uses the AudioSet +mean and std by default. You can check [`ast/src/get_norm_stats.py`](https://github.com/YuanGongND/ast/blob/master/src/get_norm_stats.py) to see how +the authors compute the stats for a downstream dataset. +- Note that the AST needs a low learning rate (the authors use a 10 times smaller learning rate compared to their CNN model proposed in the +[PSLA paper](https://arxiv.org/abs/2102.01243)) and converges quickly, so please search for a suitable learning rate and learning rate scheduler for your task. 
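The tips above translate into a short inference sketch along these lines — the checkpoint name and the silent dummy waveform are illustrative assumptions, with [`ASTFeatureExtractor`] handling the spectrogram conversion and the AudioSet-style normalization discussed above:

```python
import numpy as np
import torch
from transformers import ASTFeatureExtractor, ASTForAudioClassification

# Illustrative checkpoint; any AST audio-classification checkpoint works the same way.
checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"

feature_extractor = ASTFeatureExtractor.from_pretrained(checkpoint)
model = ASTForAudioClassification.from_pretrained(checkpoint)

# Dummy 1-second mono waveform at 16 kHz; replace with real audio resampled to 16 kHz.
waveform = np.zeros(16000, dtype=np.float32)

# The feature extractor turns the waveform into a log-mel spectrogram and applies
# the mean/std normalization (AudioSet statistics by default).
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_id = logits.argmax(-1).item()
print(model.config.id2label[predicted_id])
```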
+ ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with the Audio Spectrogram Transformer. diff --git a/docs/source/en/model_doc/autoformer.md b/docs/source/en/model_doc/autoformer.md index 20977c71cae..bb423e941c7 100644 --- a/docs/source/en/model_doc/autoformer.md +++ b/docs/source/en/model_doc/autoformer.md @@ -39,13 +39,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] AutoformerConfig - ## AutoformerModel [[autodoc]] AutoformerModel - forward - ## AutoformerForPrediction [[autodoc]] AutoformerForPrediction diff --git a/docs/source/en/model_doc/bark.md b/docs/source/en/model_doc/bark.md index e287df13fe0..0d9127d917d 100644 --- a/docs/source/en/model_doc/bark.md +++ b/docs/source/en/model_doc/bark.md @@ -14,8 +14,7 @@ specific language governing permissions and limitations under the License. ## Overview -Bark is a transformer-based text-to-speech model proposed by Suno AI in [suno-ai/bark](https://github.com/suno-ai/bark). - +Bark is a transformer-based text-to-speech model proposed by Suno AI in [suno-ai/bark](https://github.com/suno-ai/bark). Bark is made of 4 main models: @@ -26,6 +25,9 @@ Bark is made of 4 main models: It should be noted that each of the first three modules can support conditional speaker embeddings to condition the output sound according to specific predefined voice. +This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe) and [Sanchit Gandhi (sanchit-gandhi)](https://github.com/sanchit-gandhi). +The original code can be found [here](https://github.com/suno-ai/bark). + ### Optimizing Bark Bark can be optimized with just a few extra lines of code, which **significantly reduces its memory footprint** and **accelerates inference**. @@ -86,7 +88,7 @@ model.enable_cpu_offload() Find out more on inference optimization techniques [here](https://huggingface.co/docs/transformers/perf_infer_gpu_one). -### Tips +### Usage tips Suno offers a library of voice presets in a number of languages [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c). These presets are also uploaded in the hub [here](https://huggingface.co/suno/bark-small/tree/main/speaker_embeddings) or [here](https://huggingface.co/suno/bark/tree/main/speaker_embeddings). @@ -142,11 +144,6 @@ To save the audio, simply take the sample rate from the model config and some sc >>> write_wav("bark_generation.wav", sample_rate, audio_array) ``` - -This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe) and [Sanchit Gandhi (sanchit-gandhi)](https://github.com/sanchit-gandhi). -The original code can be found [here](https://github.com/suno-ai/bark). - - ## BarkConfig [[autodoc]] BarkConfig diff --git a/docs/source/en/model_doc/bart.md b/docs/source/en/model_doc/bart.md index dcf149fd85e..7986228915c 100644 --- a/docs/source/en/model_doc/bart.md +++ b/docs/source/en/model_doc/bart.md @@ -25,9 +25,6 @@ rendered properly in your Markdown viewer. 
-**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign -@patrickvonplaten - ## Overview The Bart model was proposed in [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, @@ -45,7 +42,9 @@ According to the abstract, state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE. -Tips: +This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The authors' code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/bart). + +## Usage tips: - BART is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. @@ -57,18 +56,6 @@ Tips: * permute sentences * rotate the document to make it start at a specific token -This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The Authors' code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/bart). - - -### Examples - -- Examples and scripts for fine-tuning BART and other models for sequence to sequence tasks can be found in - [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md). -- An example of how to train [`BartForConditionalGeneration`] with a Hugging Face `datasets` - object can be found in this [forum discussion](https://discuss.huggingface.co/t/train-bart-for-conditional-generation-e-g-summarization/1904). -- [Distilled checkpoints](https://huggingface.co/models?search=distilbart) are described in this [paper](https://arxiv.org/abs/2010.13002). - - ## Implementation Notes - Bart doesn't use `token_type_ids` for sequence classification. Use [`BartTokenizer`] or @@ -112,6 +99,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - [`BartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb). - [`TFBartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). - [`FlaxBartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization). +- An example of how to train [`BartForConditionalGeneration`] with a Hugging Face `datasets` object can be found in this [forum discussion](https://discuss.huggingface.co/t/train-bart-for-conditional-generation-e-g-summarization/1904) - [Summarization](https://huggingface.co/course/chapter7/5?fw=pt#summarization) chapter of the πŸ€— Hugging Face course. - [Summarization task guide](../tasks/summarization) @@ -134,6 +122,7 @@ See also: - [Text classification task guide](../tasks/sequence_classification) - [Question answering task guide](../tasks/question_answering) - [Causal language modeling task guide](../tasks/language_modeling) +- [Distilled checkpoints](https://huggingface.co/models?search=distilbart) are described in this [paper](https://arxiv.org/abs/2010.13002). 
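For reference, a minimal summarization sketch with [`BartForConditionalGeneration`] could look as follows — the `facebook/bart-large-cnn` checkpoint and the example article are placeholders for your own model and data:

```python
from transformers import BartForConditionalGeneration, BartTokenizer

# facebook/bart-large-cnn is one public summarization checkpoint; substitute your own fine-tuned model.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

article = (
    "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
    "amid dry conditions. The aim is to reduce the risk of wildfires."
)

inputs = tokenizer(article, max_length=1024, truncation=True, return_tensors="pt")

# Beam search tends to work well with BART summarization checkpoints.
summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=60)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```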
## BartConfig @@ -150,6 +139,10 @@ See also: [[autodoc]] BartTokenizerFast - all + + + + ## BartModel [[autodoc]] BartModel @@ -175,6 +168,9 @@ See also: [[autodoc]] BartForCausalLM - forward + + + ## TFBartModel [[autodoc]] TFBartModel @@ -190,6 +186,9 @@ See also: [[autodoc]] TFBartForSequenceClassification - call + + + ## FlaxBartModel [[autodoc]] FlaxBartModel @@ -222,3 +221,8 @@ See also: [[autodoc]] FlaxBartForCausalLM - __call__ + + + + + diff --git a/docs/source/en/model_doc/barthez.md b/docs/source/en/model_doc/barthez.md index fdeb8e2fed2..1b571e242f4 100644 --- a/docs/source/en/model_doc/barthez.md +++ b/docs/source/en/model_doc/barthez.md @@ -38,8 +38,14 @@ provides a significant boost over vanilla BARThez, and is on par with or outperf This model was contributed by [moussakam](https://huggingface.co/moussakam). The Authors' code can be found [here](https://github.com/moussaKam/BARThez). + -### Examples +BARThez implementation is the same as BART, except for tokenization. Refer to [BART documentation](bart) for information on +configuration classes and their parameters. BARThez-specific tokenizers are documented below. + + + +## Resources - BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check: [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md). diff --git a/docs/source/en/model_doc/bartpho.md b/docs/source/en/model_doc/bartpho.md index 3529c11a7ed..8f0a5f8bfe2 100644 --- a/docs/source/en/model_doc/bartpho.md +++ b/docs/source/en/model_doc/bartpho.md @@ -29,7 +29,9 @@ on a downstream task of Vietnamese text summarization show that in both automati outperforms the strong baseline mBART and improves the state-of-the-art. We release BARTpho to facilitate future research and applications of generative Vietnamese NLP tasks.* -Example of use: +This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/BARTpho). + +## Usage example ```python >>> import torch @@ -54,7 +56,7 @@ Example of use: >>> features = bartpho(**input_ids) ``` -Tips: +## Usage tips - Following mBART, BARTpho uses the "large" architecture of BART with an additional layer-normalization layer on top of both the encoder and decoder. Thus, usage examples in the [documentation of BART](bart), when adapting to use @@ -79,8 +81,6 @@ Tips: Other languages, if employing this pre-trained multilingual SentencePiece model "vocab_file" for subword segmentation, can reuse BartphoTokenizer with their own language-specialized "monolingual_vocab_file". -This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/BARTpho). - ## BartphoTokenizer [[autodoc]] BartphoTokenizer diff --git a/docs/source/en/model_doc/beit.md b/docs/source/en/model_doc/beit.md index 69586724713..f7605ebcdf9 100644 --- a/docs/source/en/model_doc/beit.md +++ b/docs/source/en/model_doc/beit.md @@ -39,7 +39,10 @@ with previous pre-training methods. For example, base-size BEiT achieves 83.2% t significantly outperforming from-scratch DeiT training (81.8%) with the same setup. Moreover, large-size BEiT obtains 86.3% only using ImageNet-1K, even outperforming ViT-L with supervised pre-training on ImageNet-22K (85.2%).* -Tips: +This model was contributed by [nielsr](https://huggingface.co/nielsr). 
The JAX/FLAX version of this model was +contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/beit). + +## Usage tips - BEiT models are regular Vision Transformers, but pre-trained in a self-supervised way rather than supervised. They outperform both the [original model (ViT)](vit) as well as [Data-efficient Image Transformers (DeiT)](deit) when fine-tuned on ImageNet-1K and CIFAR-100. You can check out demo notebooks regarding inference as well as @@ -68,9 +71,6 @@ alt="drawing" width="600"/> BEiT pre-training. Taken from the original paper. -This model was contributed by [nielsr](https://huggingface.co/nielsr). The JAX/FLAX version of this model was -contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/beit). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BEiT. @@ -107,6 +107,9 @@ If you're interested in submitting a resource to be included here, please feel f - preprocess - post_process_semantic_segmentation + + + ## BeitModel [[autodoc]] BeitModel @@ -127,6 +130,9 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] BeitForSemanticSegmentation - forward + + + ## FlaxBeitModel [[autodoc]] FlaxBeitModel @@ -141,3 +147,6 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] FlaxBeitForImageClassification - __call__ + + + \ No newline at end of file diff --git a/docs/source/en/model_doc/bert-generation.md b/docs/source/en/model_doc/bert-generation.md index 9cc7bac6c7e..7edbf38694e 100644 --- a/docs/source/en/model_doc/bert-generation.md +++ b/docs/source/en/model_doc/bert-generation.md @@ -33,10 +33,13 @@ GPT-2 and RoBERTa checkpoints and conducted an extensive empirical study on the encoder and decoder, with these checkpoints. Our models result in new state-of-the-art results on Machine Translation, Text Summarization, Sentence Splitting, and Sentence Fusion.* -Usage: +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be +found [here](https://tfhub.dev/s?module-type=text-generation&subtype=module,placeholder). -- The model can be used in combination with the [`EncoderDecoderModel`] to leverage two pretrained - BERT checkpoints for subsequent fine-tuning. +## Usage examples and tips + +The model can be used in combination with the [`EncoderDecoderModel`] to leverage two pretrained BERT checkpoints for +subsequent fine-tuning: ```python >>> # leverage checkpoints for Bert2Bert model... @@ -61,8 +64,7 @@ Usage: >>> loss.backward() ``` -- Pretrained [`EncoderDecoderModel`] are also directly available in the model hub, e.g., - +Pretrained [`EncoderDecoderModel`] are also directly available in the model hub, e.g.: ```python >>> # instantiate sentence fusion model @@ -85,9 +87,6 @@ Tips: - For summarization, sentence splitting, sentence fusion and translation, no special tokens are required for the input. Therefore, no EOS token should be added to the end of the input. -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be -found [here](https://tfhub.dev/s?module-type=text-generation&subtype=module,placeholder). 
- ## BertGenerationConfig [[autodoc]] BertGenerationConfig diff --git a/docs/source/en/model_doc/bert-japanese.md b/docs/source/en/model_doc/bert-japanese.md index 208b775307a..d68bb221d57 100644 --- a/docs/source/en/model_doc/bert-japanese.md +++ b/docs/source/en/model_doc/bert-japanese.md @@ -67,12 +67,16 @@ Example of using a model with Character tokenization: >>> outputs = bertjapanese(**inputs) ``` -Tips: - -- This implementation is the same as BERT, except for tokenization method. Refer to the [documentation of BERT](bert) for more usage examples. - This model was contributed by [cl-tohoku](https://huggingface.co/cl-tohoku). + + +This implementation is the same as BERT, except for tokenization method. Refer to [BERT documentation](bert) for +API reference information. + + + + ## BertJapaneseTokenizer [[autodoc]] BertJapaneseTokenizer diff --git a/docs/source/en/model_doc/bert.md b/docs/source/en/model_doc/bert.md index 19d15cfc05a..bdf4566b43a 100644 --- a/docs/source/en/model_doc/bert.md +++ b/docs/source/en/model_doc/bert.md @@ -45,7 +45,9 @@ language processing tasks, including pushing the GLUE score to 80.5% (7.7% point accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).* -Tips: +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/google-research/bert). + +## Usage tips - BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. @@ -59,10 +61,6 @@ Tips: - The model must predict the original sentence, but has a second objective: inputs are two sentences A and B (with a separation token in between). With probability 50%, the sentences are consecutive in the corpus, in the remaining 50% they are not related. The model has to predict if the sentences are consecutive or not. - - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/google-research/bert). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BERT. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
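A minimal masked language modeling sketch with [`BertForMaskedLM`], assuming the public `bert-base-uncased` checkpoint (any BERT checkpoint with an MLM head works the same way):

```python
import torch
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# Locate the [MASK] position and take the highest-scoring token predicted for it.
mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_id))
```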
@@ -137,14 +135,23 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - create_token_type_ids_from_sequences - save_vocabulary + + + ## BertTokenizerFast [[autodoc]] BertTokenizerFast + + + ## TFBertTokenizer [[autodoc]] TFBertTokenizer + + + ## Bert specific outputs [[autodoc]] models.bert.modeling_bert.BertForPreTrainingOutput @@ -153,6 +160,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput + + + + ## BertModel [[autodoc]] BertModel @@ -198,6 +209,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] BertForQuestionAnswering - forward + + + ## TFBertModel [[autodoc]] TFBertModel @@ -243,6 +257,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TFBertForQuestionAnswering - call + + + ## FlaxBertModel [[autodoc]] FlaxBertModel @@ -287,3 +304,8 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] FlaxBertForQuestionAnswering - __call__ + + + + + diff --git a/docs/source/en/model_doc/bertweet.md b/docs/source/en/model_doc/bertweet.md index 50629445aee..c4c883b21ad 100644 --- a/docs/source/en/model_doc/bertweet.md +++ b/docs/source/en/model_doc/bertweet.md @@ -28,7 +28,9 @@ al., 2019). Experiments show that BERTweet outperforms strong baselines RoBERTa- 2020), producing better performance results than the previous state-of-the-art models on three Tweet NLP tasks: Part-of-speech tagging, Named-entity recognition and text classification.* -Example of use: +This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/BERTweet). + +## Usage example ```python >>> import torch @@ -55,7 +57,12 @@ Example of use: >>> # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base") ``` -This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/BERTweet). + + +This implementation is the same as BERT, except for tokenization method. Refer to [BERT documentation](bert) for +API reference information. + + ## BertweetTokenizer diff --git a/docs/source/en/model_doc/big_bird.md b/docs/source/en/model_doc/big_bird.md index b8bbb388d6e..3d1ef91d560 100644 --- a/docs/source/en/model_doc/big_bird.md +++ b/docs/source/en/model_doc/big_bird.md @@ -41,7 +41,10 @@ sequence as part of the sparse attention mechanism. The proposed sparse attentio BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also propose novel applications to genomics data.* -Tips: +This model was contributed by [vasudevgupta](https://huggingface.co/vasudevgupta). The original code can be found +[here](https://github.com/google-research/bigbird). + +## Usage tips - For an in-detail explanation on how BigBird's attention works, see [this blog post](https://huggingface.co/blog/big-bird). - BigBird comes with 2 implementations: **original_full** & **block_sparse**. For the sequence length < 1024, using @@ -53,10 +56,8 @@ Tips: - BigBird is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. -This model was contributed by [vasudevgupta](https://huggingface.co/vasudevgupta). The original code can be found -[here](https://github.com/google-research/bigbird). 
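A short sketch of switching between the two attention implementations mentioned in the tips above, assuming the public `google/bigbird-roberta-base` checkpoint:

```python
from transformers import BigBirdModel

# By default the model runs in block sparse mode (num_random_blocks=3, block_size=64).
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base")

# For shorter sequences (< 1024 tokens), full attention is usually the better choice.
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base", attention_type="original_full")

# The sparse-attention knobs can also be overridden at load time.
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base", block_size=16, num_random_blocks=2)
```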
-## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -85,6 +86,9 @@ This model was contributed by [vasudevgupta](https://huggingface.co/vasudevgupta [[autodoc]] models.big_bird.modeling_big_bird.BigBirdForPreTrainingOutput + + + ## BigBirdModel [[autodoc]] BigBirdModel @@ -125,6 +129,9 @@ This model was contributed by [vasudevgupta](https://huggingface.co/vasudevgupta [[autodoc]] BigBirdForQuestionAnswering - forward + + + ## FlaxBigBirdModel [[autodoc]] FlaxBigBirdModel @@ -164,3 +171,8 @@ This model was contributed by [vasudevgupta](https://huggingface.co/vasudevgupta [[autodoc]] FlaxBigBirdForQuestionAnswering - __call__ + + + + + diff --git a/docs/source/en/model_doc/bigbird_pegasus.md b/docs/source/en/model_doc/bigbird_pegasus.md index d767f548a76..003e5643719 100644 --- a/docs/source/en/model_doc/bigbird_pegasus.md +++ b/docs/source/en/model_doc/bigbird_pegasus.md @@ -41,7 +41,9 @@ sequence as part of the sparse attention mechanism. The proposed sparse attentio BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also propose novel applications to genomics data.* -Tips: +The original code can be found [here](https://github.com/google-research/bigbird). + +## Usage tips - For an in-detail explanation on how BigBird's attention works, see [this blog post](https://huggingface.co/blog/big-bird). - BigBird comes with 2 implementations: **original_full** & **block_sparse**. For the sequence length < 1024, using @@ -54,9 +56,7 @@ Tips: - BigBird is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. -The original code can be found [here](https://github.com/google-research/bigbird). - -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Question answering task guide](../tasks/question_answering) diff --git a/docs/source/en/model_doc/biogpt.md b/docs/source/en/model_doc/biogpt.md index 29327df21a0..1cac6d10990 100644 --- a/docs/source/en/model_doc/biogpt.md +++ b/docs/source/en/model_doc/biogpt.md @@ -25,15 +25,15 @@ The abstract from the paper is the following: *Pre-trained language models have attracted increasing attention in the biomedical domain, inspired by their great success in the general natural language domain. Among the two main branches of pre-trained language models in the general language domain, i.e. BERT (and its variants) and GPT (and its variants), the first one has been extensively studied in the biomedical domain, such as BioBERT and PubMedBERT. While they have achieved great success on a variety of discriminative downstream biomedical tasks, the lack of generation ability constrains their application scope. In this paper, we propose BioGPT, a domain-specific generative Transformer language model pre-trained on large-scale biomedical literature. We evaluate BioGPT on six biomedical natural language processing tasks and demonstrate that our model outperforms previous models on most tasks. Especially, we get 44.98%, 38.42% and 40.76% F1 score on BC5CDR, KD-DTI and DDI end-to-end relation extraction tasks, respectively, and 78.2% accuracy on PubMedQA, creating a new record. 
Our case study on text generation further demonstrates the advantage of BioGPT on biomedical literature to generate fluent descriptions for biomedical terms.* -Tips: +This model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/BioGPT). -- BioGPT is a model with absolute position embeddings so it’s usually advised to pad the inputs on the right rather than the left. +## Usage tips + +- BioGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. - BioGPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next token in a sequence. Leveraging this feature allows BioGPT to generate syntactically coherent text as it can be observed in the run_generation.py example script. - The model can take the `past_key_values` (for PyTorch) as input, which is the previously computed key/value attention pairs. Using this (past_key_values or past) value prevents the model from re-computing pre-computed values in the context of text generation. For PyTorch, see past_key_values argument of the BioGptForCausalLM.forward() method for more information on its usage. -This model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/BioGPT). - -## Documentation resources +## Resources - [Causal language modeling task guide](../tasks/language_modeling) diff --git a/docs/source/en/model_doc/bit.md b/docs/source/en/model_doc/bit.md index 80b9fdd2caf..7f8a8ea67c4 100644 --- a/docs/source/en/model_doc/bit.md +++ b/docs/source/en/model_doc/bit.md @@ -25,15 +25,15 @@ The abstract from the paper is the following: *Transfer of pre-trained representations improves sample efficiency and simplifies hyperparameter tuning when training deep neural networks for vision. We revisit the paradigm of pre-training on large supervised datasets and fine-tuning the model on a target task. We scale up pre-training, and propose a simple recipe that we call Big Transfer (BiT). By combining a few carefully selected components, and transferring using a simple heuristic, we achieve strong performance on over 20 datasets. BiT performs well across a surprisingly wide range of data regimes -- from 1 example per class to 1M total examples. BiT achieves 87.5% top-1 accuracy on ILSVRC-2012, 99.4% on CIFAR-10, and 76.3% on the 19 task Visual Task Adaptation Benchmark (VTAB). On small datasets, BiT attains 76.8% on ILSVRC-2012 with 10 examples per class, and 97.0% on CIFAR-10 with 10 examples per class. We conduct detailed analysis of the main components that lead to high transfer performance.* -Tips: +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/google-research/big_transfer). + +## Usage tips - BiT models are equivalent to ResNetv2 in terms of architecture, except that: 1) all batch normalization layers are replaced by [group normalization](https://arxiv.org/abs/1803.08494), 2) [weight standardization](https://arxiv.org/abs/1903.10520) is used for convolutional layers. The authors show that the combination of both is useful for training with large batch sizes, and has a significant impact on transfer learning. -This model was contributed by [nielsr](https://huggingface.co/nielsr). -The original code can be found [here](https://github.com/google-research/big_transfer). 
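A minimal image classification sketch with [`BitForImageClassification`] — the `google/bit-50` checkpoint and the COCO image URL are illustrative placeholders:

```python
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, BitForImageClassification

# Example input image; replace with your own.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("google/bit-50")
model = BitForImageClassification.from_pretrained("google/bit-50")

inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
```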
- ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BiT. @@ -62,5 +62,4 @@ If you're interested in submitting a resource to be included here, please feel f ## BitForImageClassification [[autodoc]] BitForImageClassification - - forward - + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/blenderbot-small.md b/docs/source/en/model_doc/blenderbot-small.md index c126bc9b145..d5f4a7d849b 100644 --- a/docs/source/en/model_doc/blenderbot-small.md +++ b/docs/source/en/model_doc/blenderbot-small.md @@ -40,15 +40,16 @@ and code publicly available. Human evaluations show our best models are superior dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing failure cases of our models.* -Tips: - -- Blenderbot Small is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than - the left. - This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The authors' code can be found [here](https://github.com/facebookresearch/ParlAI). -## Documentation resources +## Usage tips + +Blenderbot Small is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than +the left. + + +## Resources - [Causal language modeling task guide](../tasks/language_modeling) - [Translation task guide](../tasks/translation) @@ -70,6 +71,9 @@ found [here](https://github.com/facebookresearch/ParlAI). [[autodoc]] BlenderbotSmallTokenizerFast + + + ## BlenderbotSmallModel [[autodoc]] BlenderbotSmallModel @@ -85,6 +89,9 @@ found [here](https://github.com/facebookresearch/ParlAI). [[autodoc]] BlenderbotSmallForCausalLM - forward + + + ## TFBlenderbotSmallModel [[autodoc]] TFBlenderbotSmallModel @@ -95,6 +102,9 @@ found [here](https://github.com/facebookresearch/ParlAI). [[autodoc]] TFBlenderbotSmallForConditionalGeneration - call + + + ## FlaxBlenderbotSmallModel [[autodoc]] FlaxBlenderbotSmallModel @@ -108,3 +118,6 @@ found [here](https://github.com/facebookresearch/ParlAI). - __call__ - encode - decode + + + diff --git a/docs/source/en/model_doc/blenderbot.md b/docs/source/en/model_doc/blenderbot.md index 5a10af77b69..42e1710cb2d 100644 --- a/docs/source/en/model_doc/blenderbot.md +++ b/docs/source/en/model_doc/blenderbot.md @@ -16,8 +16,6 @@ rendered properly in your Markdown viewer. # Blenderbot -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) . - ## Overview The Blender chatbot model was proposed in [Recipes for building an open-domain chatbot](https://arxiv.org/pdf/2004.13637.pdf) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, @@ -36,26 +34,14 @@ and code publicly available. Human evaluations show our best models are superior dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing failure cases of our models.* -Tips: - -- Blenderbot is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than - the left. - This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The authors' code can be found [here](https://github.com/facebookresearch/ParlAI) . 
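For the BlenderbotSmall checkpoints documented above, a minimal generation sketch could look like this — `facebook/blenderbot_small-90M` is the public 90M-parameter checkpoint and stands in for any BlenderbotSmall model:

```python
from transformers import BlenderbotSmallForConditionalGeneration, BlenderbotSmallTokenizer

mname = "facebook/blenderbot_small-90M"
tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname)
model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname)

utterance = "My friends are cool but they eat too many carbs."
inputs = tokenizer([utterance], return_tensors="pt")

# Generate a reply and decode it back to text.
reply_ids = model.generate(**inputs)
print(tokenizer.batch_decode(reply_ids, skip_special_tokens=True))
```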
+## Usage tips and example -## Implementation Notes +Blenderbot is a model with absolute position embeddings so it's usually advised to pad the inputs on the right +rather than the left. -- Blenderbot uses a standard [seq2seq model transformer](https://arxiv.org/pdf/1706.03762.pdf) based architecture. -- Available checkpoints can be found in the [model hub](https://huggingface.co/models?search=blenderbot). -- This is the *default* Blenderbot model class. However, some smaller checkpoints, such as - `facebook/blenderbot_small_90M`, have a different architecture and consequently should be used with - [BlenderbotSmall](blenderbot-small). - - -## Usage - -Here is an example of model usage: +An example: ```python >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration @@ -70,7 +56,16 @@ Here is an example of model usage: [" That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?"] ``` -## Documentation resources +## Implementation Notes + +- Blenderbot uses a standard [seq2seq model transformer](https://arxiv.org/pdf/1706.03762.pdf) based architecture. +- Available checkpoints can be found in the [model hub](https://huggingface.co/models?search=blenderbot). +- This is the *default* Blenderbot model class. However, some smaller checkpoints, such as + `facebook/blenderbot_small_90M`, have a different architecture and consequently should be used with + [BlenderbotSmall](blenderbot-small). + + +## Resources - [Causal language modeling task guide](../tasks/language_modeling) - [Translation task guide](../tasks/translation) @@ -90,9 +85,13 @@ Here is an example of model usage: [[autodoc]] BlenderbotTokenizerFast - build_inputs_with_special_tokens + + + + ## BlenderbotModel -See `transformers.BartModel` for arguments to *forward* and *generate* +See [`~transformers.BartModel`] for arguments to *forward* and *generate* [[autodoc]] BlenderbotModel - forward @@ -109,6 +108,9 @@ See [`~transformers.BartForConditionalGeneration`] for arguments to *forward* an [[autodoc]] BlenderbotForCausalLM - forward + + + ## TFBlenderbotModel [[autodoc]] TFBlenderbotModel @@ -119,6 +121,9 @@ See [`~transformers.BartForConditionalGeneration`] for arguments to *forward* an [[autodoc]] TFBlenderbotForConditionalGeneration - call + + + ## FlaxBlenderbotModel [[autodoc]] FlaxBlenderbotModel @@ -132,3 +137,8 @@ See [`~transformers.BartForConditionalGeneration`] for arguments to *forward* an - __call__ - encode - decode + + + + + diff --git a/docs/source/en/model_doc/blip-2.md b/docs/source/en/model_doc/blip-2.md index 0890e612561..d2a47e7af8f 100644 --- a/docs/source/en/model_doc/blip-2.md +++ b/docs/source/en/model_doc/blip-2.md @@ -27,11 +27,6 @@ The abstract from the paper is the following: *The cost of vision-and-language pre-training has become increasingly prohibitive due to end-to-end training of large-scale models. This paper proposes BLIP-2, a generic and efficient pre-training strategy that bootstraps vision-language pre-training from off-the-shelf frozen pre-trained image encoders and frozen large language models. BLIP-2 bridges the modality gap with a lightweight Querying Transformer, which is pre-trained in two stages. The first stage bootstraps vision-language representation learning from a frozen image encoder. The second stage bootstraps vision-to-language generative learning from a frozen language model. 
BLIP-2 achieves state-of-the-art performance on various vision-language tasks, despite having significantly fewer trainable parameters than existing methods. For example, our model outperforms Flamingo80B by 8.7% on zero-shot VQAv2 with 54x fewer trainable parameters. We also demonstrate the model's emerging capabilities of zero-shot image-to-text generation that can follow natural language instructions.* -Tips: - -- BLIP-2 can be used for conditional text generation given an image and an optional text prompt. At inference time, it's recommended to use the [`generate`] method. -- One can use [`Blip2Processor`] to prepare images for the model, and decode the predicted tokens ID's back to text. - drawing @@ -40,6 +35,11 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/salesforce/LAVIS/tree/5ee63d688ba4cebff63acee04adaef2dee9af207). +## Usage tips + +- BLIP-2 can be used for conditional text generation given an image and an optional text prompt. At inference time, it's recommended to use the [`generate`] method. +- One can use [`Blip2Processor`] to prepare images for the model, and decode the predicted tokens ID's back to text. + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BLIP-2. diff --git a/docs/source/en/model_doc/blip.md b/docs/source/en/model_doc/blip.md index 8afed63311f..bc122c942a6 100644 --- a/docs/source/en/model_doc/blip.md +++ b/docs/source/en/model_doc/blip.md @@ -20,7 +20,7 @@ rendered properly in your Markdown viewer. The BLIP model was proposed in [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. -BLIP is a model that is able to perform various multi-modal tasks including +BLIP is a model that is able to perform various multi-modal tasks including: - Visual Question Answering - Image-Text retrieval (Image-text matching) - Image Captioning @@ -39,7 +39,6 @@ The original code can be found [here](https://github.com/salesforce/BLIP). - [Jupyter notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) on how to fine-tune BLIP for image captioning on a custom dataset - ## BlipConfig [[autodoc]] BlipConfig @@ -57,12 +56,14 @@ The original code can be found [here](https://github.com/salesforce/BLIP). [[autodoc]] BlipProcessor - ## BlipImageProcessor [[autodoc]] BlipImageProcessor - preprocess + + + ## BlipModel [[autodoc]] BlipModel @@ -75,30 +76,29 @@ The original code can be found [here](https://github.com/salesforce/BLIP). [[autodoc]] BlipTextModel - forward - ## BlipVisionModel [[autodoc]] BlipVisionModel - forward - ## BlipForConditionalGeneration [[autodoc]] BlipForConditionalGeneration - forward - ## BlipForImageTextRetrieval [[autodoc]] BlipForImageTextRetrieval - forward - ## BlipForQuestionAnswering [[autodoc]] BlipForQuestionAnswering - forward + + + ## TFBlipModel [[autodoc]] TFBlipModel @@ -111,26 +111,24 @@ The original code can be found [here](https://github.com/salesforce/BLIP). 
[[autodoc]] TFBlipTextModel - call - ## TFBlipVisionModel [[autodoc]] TFBlipVisionModel - call - ## TFBlipForConditionalGeneration [[autodoc]] TFBlipForConditionalGeneration - call - ## TFBlipForImageTextRetrieval [[autodoc]] TFBlipForImageTextRetrieval - call - ## TFBlipForQuestionAnswering [[autodoc]] TFBlipForQuestionAnswering - - call \ No newline at end of file + - call + + diff --git a/docs/source/en/model_doc/bloom.md b/docs/source/en/model_doc/bloom.md index 3c155fa5878..a1d39d13ad0 100644 --- a/docs/source/en/model_doc/bloom.md +++ b/docs/source/en/model_doc/bloom.md @@ -56,16 +56,20 @@ See also: [[autodoc]] BloomConfig - all -## BloomModel - -[[autodoc]] BloomModel - - forward - ## BloomTokenizerFast [[autodoc]] BloomTokenizerFast - all + + + + +## BloomModel + +[[autodoc]] BloomModel + - forward + ## BloomForCausalLM [[autodoc]] BloomForCausalLM @@ -86,6 +90,9 @@ See also: [[autodoc]] BloomForQuestionAnswering - forward + + + ## FlaxBloomModel [[autodoc]] FlaxBloomModel @@ -95,3 +102,8 @@ See also: [[autodoc]] FlaxBloomForCausalLM - __call__ + + + + + diff --git a/docs/source/en/model_doc/bort.md b/docs/source/en/model_doc/bort.md index dccf2b560b6..1542d464d9f 100644 --- a/docs/source/en/model_doc/bort.md +++ b/docs/source/en/model_doc/bort.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. -This model is in maintenance mode only, so we won't accept any new PRs changing its code. +This model is in maintenance mode only, we do not accept any new PRs changing its code. If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. You can do so by running the following command: `pip install -U transformers==4.30.0`. @@ -43,13 +43,15 @@ hardware. It is also 7.9x faster on a CPU, as well as being better performing th architecture, and some of the non-compressed variants: it obtains performance improvements of between 0.3% and 31%, absolute, with respect to BERT-large, on multiple public natural language understanding (NLU) benchmarks.* -Tips: +This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/alexa/bort/). -- BORT's model architecture is based on BERT, so one can refer to [BERT's documentation page](bert) for the - model's API as well as usage examples. -- BORT uses the RoBERTa tokenizer instead of the BERT tokenizer, so one can refer to [RoBERTa's documentation page](roberta) for the tokenizer's API as well as usage examples. +## Usage tips + +- BORT's model architecture is based on BERT, refer to [BERT's documentation page](bert) for the + model's API reference as well as usage examples. +- BORT uses the RoBERTa tokenizer instead of the BERT tokenizer, refer to [RoBERTa's documentation page](roberta) for the tokenizer's API reference as well as usage examples. - BORT requires a specific fine-tuning algorithm, called [Agora](https://adewynter.github.io/notes/bort_algorithms_and_applications.html#fine-tuning-with-algebraic-topology) , that is sadly not open-sourced yet. It would be very useful for the community, if someone tries to implement the algorithm to make BORT fine-tuning work. -This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/alexa/bort/). 
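Since BORT reuses the BERT model classes and the RoBERTa tokenizer, the standard classes are all that is needed to run it. Below is a minimal sketch, assuming the `amazon/bort` checkpoint ships the RoBERTa tokenizer files alongside the BERT-style weights (substitute whichever BORT weights you are actually using):

```python
import torch
from transformers import BertModel, RobertaTokenizer

# assumed checkpoint id; BORT has no dedicated model class, so the plain
# BERT model and RoBERTa tokenizer classes are used directly
tokenizer = RobertaTokenizer.from_pretrained("amazon/bort")
model = BertModel.from_pretrained("amazon/bort")

inputs = tokenizer("BORT is a highly compressed version of BERT-large.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
```

Fine-tuning the encoder is a different story: as noted above, reproducing the reported results requires the Agora algorithm, which is not part of this sketch.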
+ diff --git a/docs/source/en/model_doc/bridgetower.md b/docs/source/en/model_doc/bridgetower.md index ba98cea91d2..013fea06c27 100644 --- a/docs/source/en/model_doc/bridgetower.md +++ b/docs/source/en/model_doc/bridgetower.md @@ -37,7 +37,9 @@ alt="drawing" width="600"/> BridgeTower architecture. Taken from the original paper. -## Usage +This model was contributed by [Anahita Bhiwandiwalla](https://huggingface.co/anahita-b), [Tiep Le](https://huggingface.co/Tile) and [Shaoyen Tseng](https://huggingface.co/shaoyent). The original code can be found [here](https://github.com/microsoft/BridgeTower). + +## Usage tips and examples BridgeTower consists of a visual encoder, a textual encoder and cross-modal encoder with multiple lightweight bridge layers. The goal of this approach was to build a bridge between each uni-modal encoder and the cross-modal encoder to enable comprehensive and detailed interaction at each layer of the cross-modal encoder. @@ -116,9 +118,6 @@ The following example shows how to run masked language modeling using [`BridgeTo .a cat looking out of the window. ``` -This model was contributed by [Anahita Bhiwandiwalla](https://huggingface.co/anahita-b), [Tiep Le](https://huggingface.co/Tile) and [Shaoyen Tseng](https://huggingface.co/shaoyent). The original code can be found [here](https://github.com/microsoft/BridgeTower). - - Tips: - This implementation of BridgeTower uses [`RobertaTokenizer`] to generate text embeddings and OpenAI's CLIP/ViT model to compute visual embeddings. diff --git a/docs/source/en/model_doc/bros.md b/docs/source/en/model_doc/bros.md index 1c8e3f50605..419e725e75e 100644 --- a/docs/source/en/model_doc/bros.md +++ b/docs/source/en/model_doc/bros.md @@ -31,12 +31,13 @@ AMLM is a 2D version of TMLM. It randomly masks text tokens and predicts with th BROS achieves comparable or better result on Key Information Extraction (KIE) benchmarks such as FUNSD, SROIE, CORD and SciTSR, without relying on explicit visual features. - The abstract from the paper is the following: *Key information extraction (KIE) from document images requires understanding the contextual and spatial semantics of texts in two-dimensional (2D) space. Many recent studies try to solve the task by developing pre-trained language models focusing on combining visual features from document images with texts and their layout. On the other hand, this paper tackles the problem by going back to the basic: effective combination of text and layout. Specifically, we propose a pre-trained language model, named BROS (BERT Relying On Spatiality), that encodes relative positions of texts in 2D space and learns from unlabeled documents with area-masking strategy. With this optimized training scheme for understanding texts in 2D space, BROS shows comparable or better performance compared to previous methods on four KIE benchmarks (FUNSD, SROIE*, CORD, and SciTSR) without relying on visual features. This paper also reveals two real-world challenges in KIE tasks-(1) minimizing the error from incorrect text ordering and (2) efficient learning from fewer downstream examples-and demonstrates the superiority of BROS over previous methods.* -Tips: +This model was contributed by [jinho8345](https://huggingface.co/jinho8345). The original code can be found [here](https://github.com/clovaai/bros). + +## Usage tips and examples - [`~transformers.BrosModel.forward`] requires `input_ids` and `bbox` (bounding box). Each bounding box should be in (x0, y0, x1, y1) format (top-left corner, bottom-right corner). 
Obtaining of Bounding boxes depends on external OCR system. The `x` coordinate should be normalized by document image width, and the `y` coordinate should be normalized by document image height. @@ -78,9 +79,9 @@ def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512): ``` -- Demo scripts can be found [here](https://github.com/clovaai/bros). +## Resources -This model was contributed by [jinho8345](https://huggingface.co/jinho8345). The original code can be found [here](https://github.com/clovaai/bros). +- Demo scripts can be found [here](https://github.com/clovaai/bros). ## BrosConfig @@ -102,13 +103,11 @@ This model was contributed by [jinho8345](https://huggingface.co/jinho8345). The [[autodoc]] BrosForTokenClassification - forward - ## BrosSpadeEEForTokenClassification [[autodoc]] BrosSpadeEEForTokenClassification - forward - ## BrosSpadeELForTokenClassification [[autodoc]] BrosSpadeELForTokenClassification diff --git a/docs/source/en/model_doc/byt5.md b/docs/source/en/model_doc/byt5.md index 2df7c4ddaa2..dc2942e33bb 100644 --- a/docs/source/en/model_doc/byt5.md +++ b/docs/source/en/model_doc/byt5.md @@ -40,14 +40,18 @@ experiments.* This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be found [here](https://github.com/google-research/byt5). -ByT5's architecture is based on the T5v1.1 model, so one can refer to [T5v1.1's documentation page](t5v1.1). They + + +ByT5's architecture is based on the T5v1.1 model, refer to [T5v1.1's documentation page](t5v1.1) for the API reference. They only differ in how inputs should be prepared for the model, see the code examples below. + + Since ByT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix. -### Example +## Usage example ByT5 works on raw UTF-8 bytes, so it can be used without a tokenizer: diff --git a/docs/source/en/model_doc/camembert.md b/docs/source/en/model_doc/camembert.md index 3ec4cd5dd0b..dc217fe619b 100644 --- a/docs/source/en/model_doc/camembert.md +++ b/docs/source/en/model_doc/camembert.md @@ -34,14 +34,16 @@ dependency parsing, named-entity recognition, and natural language inference. Ca for most of the tasks considered. We release the pretrained model for CamemBERT hoping to foster research and downstream applications for French NLP.* -Tips: - -- This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples - as well as the information relative to the inputs and outputs. - This model was contributed by [camembert](https://huggingface.co/camembert). The original code can be found [here](https://camembert-model.fr/). -## Documentation resources + + +This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples as well +as the information relative to the inputs and outputs. + + + +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -66,6 +68,9 @@ This model was contributed by [camembert](https://huggingface.co/camembert). The [[autodoc]] CamembertTokenizerFast + + + ## CamembertModel [[autodoc]] CamembertModel @@ -94,6 +99,9 @@ This model was contributed by [camembert](https://huggingface.co/camembert). 
The [[autodoc]] CamembertForQuestionAnswering + + + ## TFCamembertModel [[autodoc]] TFCamembertModel @@ -121,3 +129,7 @@ This model was contributed by [camembert](https://huggingface.co/camembert). The ## TFCamembertForQuestionAnswering [[autodoc]] TFCamembertForQuestionAnswering + + + + diff --git a/docs/source/en/model_doc/canine.md b/docs/source/en/model_doc/canine.md index 748ec63eccc..7729d8aa91d 100644 --- a/docs/source/en/model_doc/canine.md +++ b/docs/source/en/model_doc/canine.md @@ -37,7 +37,9 @@ To use its finer-grained input effectively and efficiently, CANINE combines down sequence length, with a deep transformer stack, which encodes context. CANINE outperforms a comparable mBERT model by 2.8 F1 on TyDi QA, a challenging multilingual benchmark, despite having 28% fewer model parameters.* -Tips: +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/google-research/language/tree/master/language/canine). + +## Usage tips - CANINE uses no less than 3 Transformer encoders internally: 2 "shallow" encoders (which only consist of a single layer) and 1 "deep" encoder (which is a regular BERT encoder). First, a "shallow" encoder is used to contextualize @@ -50,19 +52,18 @@ Tips: (which has a predefined Unicode code point). For token classification tasks however, the downsampled sequence of tokens needs to be upsampled again to match the length of the original character sequence (which is 2048). The details for this can be found in the paper. -- Models: + +Model checkpoints: - [google/canine-c](https://huggingface.co/google/canine-c): Pre-trained with autoregressive character loss, 12-layer, 768-hidden, 12-heads, 121M parameters (size ~500 MB). - [google/canine-s](https://huggingface.co/google/canine-s): Pre-trained with subword loss, 12-layer, 768-hidden, 12-heads, 121M parameters (size ~500 MB). -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/google-research/language/tree/master/language/canine). 
+## Usage example -### Example - -CANINE works on raw characters, so it can be used without a tokenizer: +CANINE works on raw characters, so it can be used **without a tokenizer**: ```python >>> from transformers import CanineModel @@ -96,17 +97,13 @@ sequences to the same length): >>> sequence_output = outputs.last_hidden_state ``` -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) - [Question answering task guide](../tasks/question_answering) - [Multiple choice task guide](../tasks/multiple_choice) -## CANINE specific outputs - -[[autodoc]] models.canine.modeling_canine.CanineModelOutputWithPooling - ## CanineConfig [[autodoc]] CanineConfig @@ -118,6 +115,10 @@ sequences to the same length): - get_special_tokens_mask - create_token_type_ids_from_sequences +## CANINE specific outputs + +[[autodoc]] models.canine.modeling_canine.CanineModelOutputWithPooling + ## CanineModel [[autodoc]] CanineModel diff --git a/docs/source/en/model_doc/chinese_clip.md b/docs/source/en/model_doc/chinese_clip.md index 430a734014c..b2d27a844e9 100644 --- a/docs/source/en/model_doc/chinese_clip.md +++ b/docs/source/en/model_doc/chinese_clip.md @@ -25,7 +25,9 @@ The abstract from the paper is the following: *The tremendous success of CLIP (Radford et al., 2021) has promoted the research and application of contrastive learning for vision-language pretraining. In this work, we construct a large-scale dataset of image-text pairs in Chinese, where most data are retrieved from publicly available datasets, and we pretrain Chinese CLIP models on the new dataset. We develop 5 Chinese CLIP models of multiple sizes, spanning from 77 to 958 million parameters. Furthermore, we propose a two-stage pretraining method, where the model is first trained with the image encoder frozen and then trained with all parameters being optimized, to achieve enhanced model performance. Our comprehensive experiments demonstrate that Chinese CLIP can achieve the state-of-the-art performance on MUGE, Flickr30K-CN, and COCO-CN in the setups of zero-shot learning and finetuning, and it is able to achieve competitive performance in zero-shot image classification based on the evaluation on the ELEVATER benchmark (Li et al., 2022). Our codes, pretrained models, and demos have been released.* -## Usage +The Chinese-CLIP model was contributed by [OFA-Sys](https://huggingface.co/OFA-Sys). + +## Usage example The code snippet below shows how to compute image & text features and similarities: @@ -59,15 +61,13 @@ The code snippet below shows how to compute image & text features and similariti >>> probs = logits_per_image.softmax(dim=1) # probs: [[1.2686e-03, 5.4499e-02, 6.7968e-04, 9.4355e-01]] ``` -Currently, we release the following scales of pretrained Chinese-CLIP models at HF Model Hub: +Currently, following scales of pretrained Chinese-CLIP models are available on πŸ€— Hub: - [OFA-Sys/chinese-clip-vit-base-patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) - [OFA-Sys/chinese-clip-vit-large-patch14](https://huggingface.co/OFA-Sys/chinese-clip-vit-large-patch14) - [OFA-Sys/chinese-clip-vit-large-patch14-336px](https://huggingface.co/OFA-Sys/chinese-clip-vit-large-patch14-336px) - [OFA-Sys/chinese-clip-vit-huge-patch14](https://huggingface.co/OFA-Sys/chinese-clip-vit-huge-patch14) -The Chinese-CLIP model was contributed by [OFA-Sys](https://huggingface.co/OFA-Sys). 
- ## ChineseCLIPConfig [[autodoc]] ChineseCLIPConfig diff --git a/docs/source/en/model_doc/clap.md b/docs/source/en/model_doc/clap.md index 54082ec8aad..7bfc75e23c3 100644 --- a/docs/source/en/model_doc/clap.md +++ b/docs/source/en/model_doc/clap.md @@ -30,7 +30,6 @@ The abstract from the paper is the following: This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArtZucker) . The original code can be found [here](https://github.com/LAION-AI/Clap). - ## ClapConfig [[autodoc]] ClapConfig @@ -78,4 +77,3 @@ The original code can be found [here](https://github.com/LAION-AI/Clap). [[autodoc]] ClapAudioModelWithProjection - forward - diff --git a/docs/source/en/model_doc/clip.md b/docs/source/en/model_doc/clip.md index 29b074f1cbb..ed4fd8df789 100644 --- a/docs/source/en/model_doc/clip.md +++ b/docs/source/en/model_doc/clip.md @@ -40,7 +40,9 @@ for any dataset specific training. For instance, we match the accuracy of the or without needing to use any of the 1.28 million training examples it was trained on. We release our code and pre-trained model weights at this https URL.* -## Usage +This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/openai/CLIP). + +## Usage tips and example CLIP is a multi-modal vision and language model. It can be used for image-text similarity and for zero-shot image classification. CLIP uses a ViT like transformer to get visual features and a causal language model to get the text @@ -77,8 +79,6 @@ encode the text and prepare the images. The following example shows how to get t >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ``` -This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/openai/CLIP). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CLIP. @@ -142,6 +142,9 @@ The resource should ideally demonstrate something new instead of duplicating an [[autodoc]] CLIPProcessor + + + ## CLIPModel [[autodoc]] CLIPModel @@ -164,12 +167,14 @@ The resource should ideally demonstrate something new instead of duplicating an [[autodoc]] CLIPVisionModelWithProjection - forward - ## CLIPVisionModel [[autodoc]] CLIPVisionModel - forward + + + ## TFCLIPModel [[autodoc]] TFCLIPModel @@ -187,6 +192,9 @@ The resource should ideally demonstrate something new instead of duplicating an [[autodoc]] TFCLIPVisionModel - call + + + ## FlaxCLIPModel [[autodoc]] FlaxCLIPModel @@ -208,3 +216,6 @@ The resource should ideally demonstrate something new instead of duplicating an [[autodoc]] FlaxCLIPVisionModel - __call__ + + + diff --git a/docs/source/en/model_doc/clipseg.md b/docs/source/en/model_doc/clipseg.md index c4c60a48d05..320095bc190 100644 --- a/docs/source/en/model_doc/clipseg.md +++ b/docs/source/en/model_doc/clipseg.md @@ -41,13 +41,6 @@ to any binary segmentation task where a text or image query can be formulated. Finally, we find our system to adapt well to generalized queries involving affordances or properties* -Tips: - -- [`CLIPSegForImageSegmentation`] adds a decoder on top of [`CLIPSegModel`]. The latter is identical to [`CLIPModel`]. -- [`CLIPSegForImageSegmentation`] can generate image segmentations based on arbitrary prompts at test time. 
A prompt can be either a text -(provided to the model as `input_ids`) or an image (provided to the model as `conditional_pixel_values`). One can also provide custom -conditional embeddings (provided to the model as `conditional_embeddings`). - drawing @@ -56,6 +49,13 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/timojl/clipseg). +## Usage tips + +- [`CLIPSegForImageSegmentation`] adds a decoder on top of [`CLIPSegModel`]. The latter is identical to [`CLIPModel`]. +- [`CLIPSegForImageSegmentation`] can generate image segmentations based on arbitrary prompts at test time. A prompt can be either a text +(provided to the model as `input_ids`) or an image (provided to the model as `conditional_pixel_values`). One can also provide custom +conditional embeddings (provided to the model as `conditional_embeddings`). + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CLIPSeg. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. diff --git a/docs/source/en/model_doc/code_llama.md b/docs/source/en/model_doc/code_llama.md index a60cf164153..38d50c87334 100644 --- a/docs/source/en/model_doc/code_llama.md +++ b/docs/source/en/model_doc/code_llama.md @@ -24,7 +24,11 @@ The abstract from the paper is the following: *We release Code Llama, a family of large language models for code based on Llama 2 providing state-of-the-art performance among open models, infilling capabilities, support for large input contexts, and zero-shot instruction following ability for programming tasks. We provide multiple flavors to cover a wide range of applications: foundation models (Code Llama), Python specializations (Code Llama - Python), and instruction-following models (Code Llama - Instruct) with 7B, 13B and 34B parameters each. All models are trained on sequences of 16k tokens and show improvements on inputs with up to 100k tokens. 7B and 13B Code Llama and Code Llama - Instruct variants support infilling based on surrounding content. Code Llama reaches state-of-the-art performance among open models on several code benchmarks, with scores of up to 53% and 55% on HumanEval and MBPP, respectively. Notably, Code Llama - Python 7B outperforms Llama 2 70B on HumanEval and MBPP, and all our models outperform every other publicly available model on MultiPL-E. We release Code Llama under a permissive license that allows for both research and commercial use.* -Check out all Code Llama models [here](https://huggingface.co/models?search=code_llama) and the officially released ones in the [codellama org](https://huggingface.co/codellama). +Check out all Code Llama model checkpoints [here](https://huggingface.co/models?search=code_llama) and the officially released ones in the [codellama org](https://huggingface.co/codellama). + +This model was contributed by [ArthurZucker](https://huggingface.co/ArthurZ). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). + +## Usage tips and examples @@ -38,21 +42,22 @@ As mentioned above, the `dtype` of the storage weights is mostly irrelevant unle -Tips: -- These models have the same architecture as the `Llama2` models +Tips: - The infilling task is supported out of the box. 
You should be using the `tokenizer.fill_token` where you want your input to be filled. - The model conversion script is the same as for the `Llama2` family: -Here is a sample usage +Here is a sample usage: + ```bash python src/transformers/models/llama/convert_llama_weights_to_hf.py \ --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path ``` + Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -- After conversion, the model and tokenizer can be loaded via: +After conversion, the model and tokenizer can be loaded via: ```python >>> from transformers import LlamaForCausalLM, CodeLlamaTokenizer @@ -95,9 +100,13 @@ If you only want the infilled part: Under the hood, the tokenizer [automatically splits by ``](https://huggingface.co/docs/transformers/main/model_doc/code_llama#transformers.CodeLlamaTokenizer.fill_token) to create a formatted input string that follows [the original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself: it avoids pitfalls, such as token glueing, that are very hard to debug. To see how much CPU and GPU memory you need for this model or others, try [this calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) which can help determine that value. -- The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string. +The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string. -This model was contributed by [ArthurZucker](https://huggingface.co/ArthurZ). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). + + +Code Llama has the same architecture as the `Llama2` models, refer to [Llama2's documentation page](llama2) for the API reference. +Find Code Llama tokenizer reference below. + ## CodeLlamaTokenizer diff --git a/docs/source/en/model_doc/codegen.md b/docs/source/en/model_doc/codegen.md index 695f45f9ae1..78be813db1a 100644 --- a/docs/source/en/model_doc/codegen.md +++ b/docs/source/en/model_doc/codegen.md @@ -40,7 +40,7 @@ The original code can be found [here](https://github.com/salesforce/codegen). * `mono`: Initialized with `multi`, then further pre-trained on Python data * For example, `Salesforce/codegen-350M-mono` offers a 350 million-parameter checkpoint pre-trained sequentially on the Pile, multiple programming languages, and Python. 
-## How to use +## Usage example ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer @@ -60,7 +60,7 @@ def hello_world(): hello_world() ``` -## Documentation resources +## Resources - [Causal language modeling task guide](../tasks/language_modeling) diff --git a/docs/source/en/model_doc/conditional_detr.md b/docs/source/en/model_doc/conditional_detr.md index 8993fb38431..516e1c43685 100644 --- a/docs/source/en/model_doc/conditional_detr.md +++ b/docs/source/en/model_doc/conditional_detr.md @@ -31,7 +31,7 @@ alt="drawing" width="600"/> This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The original code can be found [here](https://github.com/Atten4Vis/ConditionalDETR). -## Documentation resources +## Resources - [Object detection task guide](../tasks/object_detection) diff --git a/docs/source/en/model_doc/convbert.md b/docs/source/en/model_doc/convbert.md index 8a0aa7a946c..17b5d7920c6 100644 --- a/docs/source/en/model_doc/convbert.md +++ b/docs/source/en/model_doc/convbert.md @@ -44,12 +44,14 @@ ConvBERT significantly outperforms BERT and its variants in various downstream t fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while using less than 1/4 training cost. Code and pre-trained models will be released.* -ConvBERT training tips are similar to those of BERT. - This model was contributed by [abhishek](https://huggingface.co/abhishek). The original implementation can be found here: https://github.com/yitu-opensource/ConvBert -## Documentation resources +## Usage tips + +ConvBERT training tips are similar to those of BERT. For usage tips refer to [BERT documentation](bert). + +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -73,6 +75,9 @@ here: https://github.com/yitu-opensource/ConvBert [[autodoc]] ConvBertTokenizerFast + + + ## ConvBertModel [[autodoc]] ConvBertModel @@ -103,6 +108,9 @@ here: https://github.com/yitu-opensource/ConvBert [[autodoc]] ConvBertForQuestionAnswering - forward + + + ## TFConvBertModel [[autodoc]] TFConvBertModel @@ -132,3 +140,6 @@ here: https://github.com/yitu-opensource/ConvBert [[autodoc]] TFConvBertForQuestionAnswering - call + + + diff --git a/docs/source/en/model_doc/convnext.md b/docs/source/en/model_doc/convnext.md index acbb0265b2e..5222834b1f6 100644 --- a/docs/source/en/model_doc/convnext.md +++ b/docs/source/en/model_doc/convnext.md @@ -32,10 +32,6 @@ of a vision Transformer, and discover several key components that contribute to dubbed ConvNeXt. Constructed entirely from standard ConvNet modules, ConvNeXts compete favorably with Transformers in terms of accuracy and scalability, achieving 87.8% ImageNet top-1 accuracy and outperforming Swin Transformers on COCO detection and ADE20K segmentation, while maintaining the simplicity and efficiency of standard ConvNets.* -Tips: - -- See the code examples below each model regarding usage. 
- drawing @@ -68,6 +64,9 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] ConvNextImageProcessor - preprocess + + + ## ConvNextModel [[autodoc]] ConvNextModel @@ -78,14 +77,18 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] ConvNextForImageClassification - forward + + ## TFConvNextModel [[autodoc]] TFConvNextModel - call - ## TFConvNextForImageClassification [[autodoc]] TFConvNextForImageClassification - call + + + \ No newline at end of file diff --git a/docs/source/en/model_doc/convnextv2.md b/docs/source/en/model_doc/convnextv2.md index af08128c45e..8cd142c2765 100644 --- a/docs/source/en/model_doc/convnextv2.md +++ b/docs/source/en/model_doc/convnextv2.md @@ -25,10 +25,6 @@ The abstract from the paper is the following: *Driven by improved architectures and better representation learning frameworks, the field of visual recognition has enjoyed rapid modernization and performance boost in the early 2020s. For example, modern ConvNets, represented by ConvNeXt, have demonstrated strong performance in various scenarios. While these models were originally designed for supervised learning with ImageNet labels, they can also potentially benefit from self-supervised learning techniques such as masked autoencoders (MAE). However, we found that simply combining these two approaches leads to subpar performance. In this paper, we propose a fully convolutional masked autoencoder framework and a new Global Response Normalization (GRN) layer that can be added to the ConvNeXt architecture to enhance inter-channel feature competition. This co-design of self-supervised learning techniques and architectural improvement results in a new model family called ConvNeXt V2, which significantly improves the performance of pure ConvNets on various recognition benchmarks, including ImageNet classification, COCO detection, and ADE20K segmentation. We also provide pre-trained ConvNeXt V2 models of various sizes, ranging from an efficient 3.7M-parameter Atto model with 76.7% top-1 accuracy on ImageNet, to a 650M Huge model that achieves a state-of-the-art 88.9% accuracy using only public training data.* -Tips: - -- See the code examples below each model regarding usage. - drawing diff --git a/docs/source/en/model_doc/cpm.md b/docs/source/en/model_doc/cpm.md index a2ecf1a1e09..129c4ed3a37 100644 --- a/docs/source/en/model_doc/cpm.md +++ b/docs/source/en/model_doc/cpm.md @@ -37,7 +37,14 @@ NLP tasks in the settings of few-shot (even zero-shot) learning.* This model was contributed by [canwenxu](https://huggingface.co/canwenxu). The original implementation can be found here: https://github.com/TsinghuaAI/CPM-Generate -Note: We only have a tokenizer here, since the model architecture is the same as GPT-2. + + + +CPM's architecture is the same as GPT-2, except for tokenization method. Refer to [GPT-2 documentation](gpt2) for +API reference information. + + + ## CpmTokenizer diff --git a/docs/source/en/model_doc/cpmant.md b/docs/source/en/model_doc/cpmant.md index 2c4ad92a629..4bcf774507f 100644 --- a/docs/source/en/model_doc/cpmant.md +++ b/docs/source/en/model_doc/cpmant.md @@ -20,11 +20,10 @@ rendered properly in your Markdown viewer. CPM-Ant is an open-source Chinese pre-trained language model (PLM) with 10B parameters. It is also the first milestone of the live training process of CPM-Live. The training process is cost-effective and environment-friendly. 
CPM-Ant also achieves promising results with delta tuning on the CUGE benchmark. Besides the full model, we also provide various compressed versions to meet the requirements of different hardware configurations. [See more](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live) -Tips: - This model was contributed by [OpenBMB](https://huggingface.co/openbmb). The original code can be found [here](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live). -βš™οΈ Training & Inference +## Resources + - A tutorial on [CPM-Live](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live). ## CpmAntConfig diff --git a/docs/source/en/model_doc/ctrl.md b/docs/source/en/model_doc/ctrl.md index 9c2413d2776..be9fa85c707 100644 --- a/docs/source/en/model_doc/ctrl.md +++ b/docs/source/en/model_doc/ctrl.md @@ -41,7 +41,10 @@ providing more explicit control over text generation. These codes also allow CTR training data are most likely given a sequence. This provides a potential method for analyzing large amounts of data via model-based source attribution.* -Tips: +This model was contributed by [keskarnitishr](https://huggingface.co/keskarnitishr). The original code can be found +[here](https://github.com/salesforce/ctrl). + +## Usage tips - CTRL makes use of control codes to generate text: it requires generations to be started by certain words, sentences or links to generate coherent text. Refer to the [original implementation](https://github.com/salesforce/ctrl) for @@ -56,10 +59,8 @@ Tips: pre-computed values in the context of text generation. See the [`forward`](model_doc/ctrl#transformers.CTRLModel.forward) method for more information on the usage of this argument. -This model was contributed by [keskarnitishr](https://huggingface.co/keskarnitishr). The original code can be found -[here](https://github.com/salesforce/ctrl). -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Causal language modeling task guide](../tasks/language_modeling) @@ -73,6 +74,9 @@ This model was contributed by [keskarnitishr](https://huggingface.co/keskarnitis [[autodoc]] CTRLTokenizer - save_vocabulary + + + ## CTRLModel [[autodoc]] CTRLModel @@ -88,6 +92,9 @@ This model was contributed by [keskarnitishr](https://huggingface.co/keskarnitis [[autodoc]] CTRLForSequenceClassification - forward + + + ## TFCTRLModel [[autodoc]] TFCTRLModel @@ -102,3 +109,6 @@ This model was contributed by [keskarnitishr](https://huggingface.co/keskarnitis [[autodoc]] TFCTRLForSequenceClassification - call + + + diff --git a/docs/source/en/model_doc/cvt.md b/docs/source/en/model_doc/cvt.md index 6c9aea5ec86..503f97795c0 100644 --- a/docs/source/en/model_doc/cvt.md +++ b/docs/source/en/model_doc/cvt.md @@ -33,15 +33,15 @@ performance gains are maintained when pretrained on larger datasets (\eg ImageNe ImageNet-22k, our CvT-W24 obtains a top-1 accuracy of 87.7\% on the ImageNet-1k val set. Finally, our results show that the positional encoding, a crucial component in existing Vision Transformers, can be safely removed in our model, simplifying the design for higher resolution vision tasks.* -Tips: +This model was contributed by [anugunj](https://huggingface.co/anugunj). The original code can be found [here](https://github.com/microsoft/CvT). + +## Usage tips - CvT models are regular Vision Transformers, but trained with convolutions. They outperform the [original model (ViT)](vit) when fine-tuned on ImageNet-1K and CIFAR-100. 
- You can check out demo notebooks regarding inference as well as fine-tuning on custom data [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer) (you can just replace [`ViTFeatureExtractor`] by [`AutoImageProcessor`] and [`ViTForImageClassification`] by [`CvtForImageClassification`]). - The available checkpoints are either (1) pre-trained on [ImageNet-22k](http://www.image-net.org/) (a collection of 14 million images and 22k classes) only, (2) also fine-tuned on ImageNet-22k or (3) also fine-tuned on [ImageNet-1k](http://www.image-net.org/challenges/LSVRC/2012/) (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes). -This model was contributed by [anugunj](https://huggingface.co/anugunj). The original code can be found [here](https://github.com/microsoft/CvT). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CvT. @@ -57,6 +57,9 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] CvtConfig + + + ## CvtModel [[autodoc]] CvtModel @@ -67,6 +70,9 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] CvtForImageClassification - forward + + + ## TFCvtModel [[autodoc]] TFCvtModel @@ -77,3 +83,5 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] TFCvtForImageClassification - call + + diff --git a/docs/source/en/model_doc/data2vec.md b/docs/source/en/model_doc/data2vec.md index dc05c44be90..517a51ce46a 100644 --- a/docs/source/en/model_doc/data2vec.md +++ b/docs/source/en/model_doc/data2vec.md @@ -35,19 +35,18 @@ the entire input. Experiments on the major benchmarks of speech recognition, ima natural language understanding demonstrate a new state of the art or competitive performance to predominant approaches. Models and code are available at www.github.com/pytorch/fairseq/tree/master/examples/data2vec.* -Tips: - -- Data2VecAudio, Data2VecText, and Data2VecVision have all been trained using the same self-supervised learning method. -- For Data2VecAudio, preprocessing is identical to [`Wav2Vec2Model`], including feature extraction -- For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization. -- For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction. - This model was contributed by [edugp](https://huggingface.co/edugp) and [patrickvonplaten](https://huggingface.co/patrickvonplaten). [sayakpaul](https://github.com/sayakpaul) and [Rocketknight1](https://github.com/Rocketknight1) contributed Data2Vec for vision in TensorFlow. The original code (for NLP and Speech) can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec). The original code for vision can be found [here](https://github.com/facebookresearch/data2vec_vision/tree/main/beit). +## Usage tips + +- Data2VecAudio, Data2VecText, and Data2VecVision have all been trained using the same self-supervised learning method. +- For Data2VecAudio, preprocessing is identical to [`Wav2Vec2Model`], including feature extraction +- For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization. +- For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction. 
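Because preprocessing is shared with the parent models listed above, the usual classes are enough to run inference. A minimal sketch for the text variant, assuming the `facebook/data2vec-text-base` checkpoint (the audio and vision variants work analogously with their respective processors):

```python
import torch
from transformers import AutoTokenizer, Data2VecTextModel

# the tokenizer resolved here is the same byte-level BPE tokenizer used by RoBERTa
tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

last_hidden_state = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)
```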
## Resources @@ -88,6 +87,8 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] Data2VecVisionConfig + + ## Data2VecAudioModel @@ -164,6 +165,9 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] Data2VecVisionForSemanticSegmentation - forward + + + ## TFData2VecVisionModel [[autodoc]] TFData2VecVisionModel @@ -178,3 +182,6 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] TFData2VecVisionForSemanticSegmentation - call + + + diff --git a/docs/source/en/model_doc/deberta-v2.md b/docs/source/en/model_doc/deberta-v2.md index 8dec57a1717..e3bd91e8e4f 100644 --- a/docs/source/en/model_doc/deberta-v2.md +++ b/docs/source/en/model_doc/deberta-v2.md @@ -62,7 +62,7 @@ New in v2: This model was contributed by [DeBERTa](https://huggingface.co/DeBERTa). This model TF 2.0 implementation was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/DeBERTa). -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -88,6 +88,9 @@ contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code - build_inputs_with_special_tokens - create_token_type_ids_from_sequences + + + ## DebertaV2Model [[autodoc]] DebertaV2Model @@ -123,6 +126,9 @@ contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code [[autodoc]] DebertaV2ForMultipleChoice - forward + + + ## TFDebertaV2Model [[autodoc]] TFDebertaV2Model @@ -157,3 +163,6 @@ contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code [[autodoc]] TFDebertaV2ForMultipleChoice - call + + + diff --git a/docs/source/en/model_doc/deberta.md b/docs/source/en/model_doc/deberta.md index ed66364a4b5..342a3bc4796 100644 --- a/docs/source/en/model_doc/deberta.md +++ b/docs/source/en/model_doc/deberta.md @@ -94,6 +94,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - build_inputs_with_special_tokens - create_token_type_ids_from_sequences + + + ## DebertaModel [[autodoc]] DebertaModel @@ -123,6 +126,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] DebertaForQuestionAnswering - forward + + + ## TFDebertaModel [[autodoc]] TFDebertaModel @@ -152,3 +158,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TFDebertaForQuestionAnswering - call + + + + diff --git a/docs/source/en/model_doc/decision_transformer.md b/docs/source/en/model_doc/decision_transformer.md index a46673d87ac..07ef2ecbdc8 100644 --- a/docs/source/en/model_doc/decision_transformer.md +++ b/docs/source/en/model_doc/decision_transformer.md @@ -33,9 +33,7 @@ This allows us to draw upon the simplicity and scalability of the Transformer ar Decision Transformer matches or exceeds the performance of state-of-the-art model-free offline RL baselines on Atari, OpenAI Gym, and Key-to-Door tasks.* -Tips: - -This version of the model is for tasks where the state is a vector, image-based states will come soon. +This version of the model is for tasks where the state is a vector. This model was contributed by [edbeeching](https://huggingface.co/edbeeching). The original code can be found [here](https://github.com/kzl/decision-transformer). 
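As a rough sketch of what "the state is a vector" means in practice: the model consumes padded trajectories of state vectors, actions and returns-to-go and predicts the next action. The checkpoint id (`edbeeching/decision-transformer-gym-hopper-medium`) and the target return below are assumptions for illustration only; any Gym Hopper Decision Transformer checkpoint would be used the same way:

```python
import torch
from transformers import DecisionTransformerModel

model = DecisionTransformerModel.from_pretrained("edbeeching/decision-transformer-gym-hopper-medium")
model.eval()

batch_size, seq_length = 1, 20
states = torch.randn(batch_size, seq_length, model.config.state_dim)  # vector-valued states
actions = torch.zeros(batch_size, seq_length, model.config.act_dim)
rewards = torch.zeros(batch_size, seq_length, 1)
returns_to_go = torch.full((batch_size, seq_length, 1), 3600.0)       # assumed target return
timesteps = torch.arange(seq_length).unsqueeze(0)
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)

with torch.no_grad():
    outputs = model(
        states=states,
        actions=actions,
        rewards=rewards,
        returns_to_go=returns_to_go,
        timesteps=timesteps,
        attention_mask=attention_mask,
    )

next_action = outputs.action_preds[0, -1]  # action the policy would take at the last step
```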
diff --git a/docs/source/en/model_doc/deformable_detr.md b/docs/source/en/model_doc/deformable_detr.md index 0bceb0bdf39..726fa0d0ca9 100644 --- a/docs/source/en/model_doc/deformable_detr.md +++ b/docs/source/en/model_doc/deformable_detr.md @@ -25,11 +25,6 @@ The abstract from the paper is the following: *DETR has been recently proposed to eliminate the need for many hand-designed components in object detection while demonstrating good performance. However, it suffers from slow convergence and limited feature spatial resolution, due to the limitation of Transformer attention modules in processing image feature maps. To mitigate these issues, we proposed Deformable DETR, whose attention modules only attend to a small set of key sampling points around a reference. Deformable DETR can achieve better performance than DETR (especially on small objects) with 10 times less training epochs. Extensive experiments on the COCO benchmark demonstrate the effectiveness of our approach.* -Tips: - -- One can use [`DeformableDetrImageProcessor`] to prepare images (and optional targets) for the model. -- Training Deformable DETR is equivalent to training the original [DETR](detr) model. See the [resources](#resources) section below for demo notebooks. - drawing @@ -37,6 +32,10 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/fundamentalvision/Deformable-DETR). +## Usage tips + +- Training Deformable DETR is equivalent to training the original [DETR](detr) model. See the [resources](#resources) section below for demo notebooks. + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Deformable DETR. diff --git a/docs/source/en/model_doc/deit.md b/docs/source/en/model_doc/deit.md index ef32e05ebd9..7d9918a45ee 100644 --- a/docs/source/en/model_doc/deit.md +++ b/docs/source/en/model_doc/deit.md @@ -16,13 +16,6 @@ rendered properly in your Markdown viewer. # DeiT - - -This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight -breaking changes to fix it in the future. If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). - - - ## Overview The DeiT model was proposed in [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre @@ -45,7 +38,9 @@ distillation, especially when using a convnet as a teacher. This leads us to rep for both Imagenet (where we obtain up to 85.2% accuracy) and when transferring to other tasks. We share our code and models.* -Tips: +This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version of this model was added by [amyeroberts](https://huggingface.co/amyeroberts). + +## Usage tips - Compared to ViT, DeiT models use a so-called distillation token to effectively learn from a teacher (which, in the DeiT paper, is a ResNet like-model). The distillation token is learned through backpropagation, by interacting with @@ -73,8 +68,6 @@ Tips: *facebook/deit-base-patch16-384*. Note that one should use [`DeiTImageProcessor`] in order to prepare images for the model. -This model was contributed by [nielsr](https://huggingface.co/nielsr). 
The TensorFlow version of this model was added by [amyeroberts](https://huggingface.co/amyeroberts). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DeiT. @@ -104,6 +97,9 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DeiTImageProcessor - preprocess + + + ## DeiTModel [[autodoc]] DeiTModel @@ -124,6 +120,9 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DeiTForImageClassificationWithTeacher - forward + + + ## TFDeiTModel [[autodoc]] TFDeiTModel @@ -143,3 +142,6 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] TFDeiTForImageClassificationWithTeacher - call + + + \ No newline at end of file diff --git a/docs/source/en/model_doc/deplot.md b/docs/source/en/model_doc/deplot.md index f425a8268fd..a77bee39de7 100644 --- a/docs/source/en/model_doc/deplot.md +++ b/docs/source/en/model_doc/deplot.md @@ -24,12 +24,10 @@ The abstract of the paper states the following: *Visual language such as charts and plots is ubiquitous in the human world. Comprehending plots and charts requires strong reasoning skills. Prior state-of-the-art (SOTA) models require at least tens of thousands of training examples and their reasoning capabilities are still much limited, especially on complex human-written queries. This paper presents the first one-shot solution to visual language reasoning. We decompose the challenge of visual language reasoning into two steps: (1) plot-to-text translation, and (2) reasoning over the translated text. The key in this method is a modality conversion module, named as DePlot, which translates the image of a plot or chart to a linearized table. The output of DePlot can then be directly used to prompt a pretrained large language model (LLM), exploiting the few-shot reasoning capabilities of LLMs. To obtain DePlot, we standardize the plot-to-table task by establishing unified task formats and metrics, and train DePlot end-to-end on this task. DePlot can then be used off-the-shelf together with LLMs in a plug-and-play fashion. Compared with a SOTA model finetuned on more than >28k data points, DePlot+LLM with just one-shot prompting achieves a 24.0% improvement over finetuned SOTA on human-written queries from the task of chart QA.* -## Model description - DePlot is a model that is trained using `Pix2Struct` architecture. You can find more information about `Pix2Struct` in the [Pix2Struct documentation](https://huggingface.co/docs/transformers/main/en/model_doc/pix2struct). DePlot is a Visual Question Answering subset of `Pix2Struct` architecture. It renders the input question on the image and predicts the answer. -## Usage +## Usage example Currently one checkpoint is available for DePlot: @@ -59,4 +57,10 @@ from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup optimizer = Adafactor(self.parameters(), scale_parameter=False, relative_step=False, lr=0.01, weight_decay=1e-05) scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=40000) -``` \ No newline at end of file +``` + + + +DePlot is a model trained using `Pix2Struct` architecture. For API reference, see [`Pix2Struct` documentation](pix2struct). 
+ + \ No newline at end of file diff --git a/docs/source/en/model_doc/deta.md b/docs/source/en/model_doc/deta.md index d384f5564e5..1eed98832ac 100644 --- a/docs/source/en/model_doc/deta.md +++ b/docs/source/en/model_doc/deta.md @@ -26,10 +26,6 @@ The abstract from the paper is the following: *Detection Transformer (DETR) directly transforms queries to unique objects by using one-to-one bipartite matching during training and enables end-to-end object detection. Recently, these models have surpassed traditional detectors on COCO with undeniable elegance. However, they differ from traditional detectors in multiple designs, including model architecture and training schedules, and thus the effectiveness of one-to-one matching is not fully understood. In this work, we conduct a strict comparison between the one-to-one Hungarian matching in DETRs and the one-to-many label assignments in traditional detectors with non-maximum supervision (NMS). Surprisingly, we observe one-to-many assignments with NMS consistently outperform standard one-to-one matching under the same setting, with a significant gain of up to 2.5 mAP. Our detector that trains Deformable-DETR with traditional IoU-based label assignment achieved 50.2 COCO mAP within 12 epochs (1x schedule) with ResNet50 backbone, outperforming all existing traditional or transformer-based detectors in this setting. On multiple datasets, schedules, and architectures, we consistently show bipartite matching is unnecessary for performant detection transformers. Furthermore, we attribute the success of detection transformers to their expressive transformer architecture.* -Tips: - -- One can use [`DetaImageProcessor`] to prepare images and optional targets for the model. - drawing @@ -51,20 +47,17 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DetaConfig - ## DetaImageProcessor [[autodoc]] DetaImageProcessor - preprocess - post_process_object_detection - ## DetaModel [[autodoc]] DetaModel - forward - ## DetaForObjectDetection [[autodoc]] DetaForObjectDetection diff --git a/docs/source/en/model_doc/detr.md b/docs/source/en/model_doc/detr.md index 2c03a0f8b85..c36bd4380ed 100644 --- a/docs/source/en/model_doc/detr.md +++ b/docs/source/en/model_doc/detr.md @@ -41,6 +41,8 @@ baselines.* This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/detr). +## How DETR works + Here's a TLDR explaining how [`~transformers.DetrForObjectDetection`] works: First, an image is sent through a pre-trained convolutional backbone (in the paper, the authors use @@ -79,7 +81,7 @@ where one first trains a [`~transformers.DetrForObjectDetection`] model to detec the mask head for 25 epochs. Experimentally, these two approaches give similar results. Note that predicting boxes is required for the training to be possible, since the Hungarian matching is computed using distances between boxes. -Tips: +## Usage tips - DETR uses so-called **object queries** to detect objects in an image. The number of queries determines the maximum number of objects that can be detected in a single image, and is set to 100 by default (see parameter @@ -165,14 +167,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! 
The resource should ideally demonstrate something new instead of duplicating an existing resource. -## DETR specific outputs - -[[autodoc]] models.detr.modeling_detr.DetrModelOutput - -[[autodoc]] models.detr.modeling_detr.DetrObjectDetectionOutput - -[[autodoc]] models.detr.modeling_detr.DetrSegmentationOutput - ## DetrConfig [[autodoc]] DetrConfig @@ -195,6 +189,14 @@ If you're interested in submitting a resource to be included here, please feel f - post_process_instance_segmentation - post_process_panoptic_segmentation +## DETR specific outputs + +[[autodoc]] models.detr.modeling_detr.DetrModelOutput + +[[autodoc]] models.detr.modeling_detr.DetrObjectDetectionOutput + +[[autodoc]] models.detr.modeling_detr.DetrSegmentationOutput + ## DetrModel [[autodoc]] DetrModel diff --git a/docs/source/en/model_doc/dialogpt.md b/docs/source/en/model_doc/dialogpt.md index 70929409b29..558b91d76d2 100644 --- a/docs/source/en/model_doc/dialogpt.md +++ b/docs/source/en/model_doc/dialogpt.md @@ -32,7 +32,9 @@ that leverage DialoGPT generate more relevant, contentful and context-consistent systems. The pre-trained model and training pipeline are publicly released to facilitate research into neural response generation and the development of more intelligent open-domain dialogue systems.* -Tips: +The original code can be found [here](https://github.com/microsoft/DialoGPT). + +## Usage tips - DialoGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. @@ -47,7 +49,8 @@ follow the OpenAI GPT-2 to model a multiturn dialogue session as a long text and modeling. We first concatenate all dialog turns within a dialogue session into a long text x_1,..., x_N (N is the sequence length), ended by the end-of-text token.* For more information please confer to the original paper. + -DialoGPT's architecture is based on the GPT2 model, so one can refer to [GPT2's documentation page](gpt2). +DialoGPT's architecture is based on the GPT2 model, refer to [GPT2's documentation page](gpt2) for API reference and examples. -The original code can be found [here](https://github.com/microsoft/DialoGPT). + diff --git a/docs/source/en/model_doc/dinat.md b/docs/source/en/model_doc/dinat.md index 2317b13b7f9..23dfa3b74fb 100644 --- a/docs/source/en/model_doc/dinat.md +++ b/docs/source/en/model_doc/dinat.md @@ -44,17 +44,6 @@ and ADE20K (48.5 PQ), and instance segmentation model on Cityscapes (44.5 AP) an It also matches the state of the art specialized semantic segmentation models on ADE20K (58.2 mIoU), and ranks second on Cityscapes (84.5 mIoU) (no extra data). * -Tips: -- One can use the [`AutoImageProcessor`] API to prepare images for the model. -- DiNAT can be used as a *backbone*. When `output_hidden_states = True`, -it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, height, width, num_channels)`. - -Notes: -- DiNAT depends on [NATTEN](https://github.com/SHI-Labs/NATTEN/)'s implementation of Neighborhood Attention and Dilated Neighborhood Attention. -You can install it with pre-built wheels for Linux by referring to [shi-labs.com/natten](https://shi-labs.com/natten), or build on your system by running `pip install natten`. -Note that the latter will likely take time to compile. NATTEN does not support Windows devices yet. -- Patch size of 4 is only supported at the moment. - drawing @@ -65,6 +54,17 @@ Taken from the original paper. 
+ + ## DistilBertModel [[autodoc]] DistilBertModel @@ -174,6 +178,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] DistilBertForQuestionAnswering - forward + + + ## TFDistilBertModel [[autodoc]] TFDistilBertModel @@ -204,6 +211,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TFDistilBertForQuestionAnswering - call + + + ## FlaxDistilBertModel [[autodoc]] FlaxDistilBertModel @@ -233,3 +243,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] FlaxDistilBertForQuestionAnswering - __call__ + + + + + + + diff --git a/docs/source/en/model_doc/dit.md b/docs/source/en/model_doc/dit.md index 7d5f873e78b..7f6691a15bc 100644 --- a/docs/source/en/model_doc/dit.md +++ b/docs/source/en/model_doc/dit.md @@ -37,6 +37,10 @@ alt="drawing" width="600"/> Summary of the approach. Taken from the [original paper](https://arxiv.org/abs/2203.02378). +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/dit). + +## Usage tips + One can directly use the weights of DiT with the AutoModel API: ```python @@ -66,10 +70,6 @@ model = AutoModelForImageClassification.from_pretrained("microsoft/dit-base-fine This particular checkpoint was fine-tuned on [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/), an important benchmark for document image classification. A notebook that illustrates inference for document image classification can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DiT/Inference_with_DiT_(Document_Image_Transformer)_for_document_image_classification.ipynb). -As DiT's architecture is equivalent to that of BEiT, one can refer to [BEiT's documentation page](beit) for all tips, code examples and notebooks. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/dit). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DiT. @@ -78,4 +78,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - [`BeitForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). -If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + + + As DiT's architecture is equivalent to that of BEiT, one can refer to [BEiT's documentation page](beit) for all tips, code examples and notebooks. + diff --git a/docs/source/en/model_doc/donut.md b/docs/source/en/model_doc/donut.md index cfbf79972d5..6e5cfe648d0 100644 --- a/docs/source/en/model_doc/donut.md +++ b/docs/source/en/model_doc/donut.md @@ -34,14 +34,14 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). 
The original code can be found [here](https://github.com/clovaai/donut). -Tips: +## Usage tips - The quickest way to get started with Donut is by checking the [tutorial notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Donut), which show how to use the model at inference time as well as fine-tuning on custom data. - Donut is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework. -## Inference +## Inference examples Donut's [`VisionEncoderDecoder`] model accepts images as input and makes use of [`~generation.GenerationMixin.generate`] to autoregressively generate text given the input image. diff --git a/docs/source/en/model_doc/dpr.md b/docs/source/en/model_doc/dpr.md index 10bc76b72dd..8b9f352b637 100644 --- a/docs/source/en/model_doc/dpr.md +++ b/docs/source/en/model_doc/dpr.md @@ -43,7 +43,8 @@ benchmarks.* This model was contributed by [lhoestq](https://huggingface.co/lhoestq). The original code can be found [here](https://github.com/facebookresearch/DPR). -Tips: +## Usage tips + - DPR consists in three models: * Question encoder: encode questions as vectors @@ -86,6 +87,9 @@ Tips: [[autodoc]] models.dpr.modeling_dpr.DPRReaderOutput + + + ## DPRContextEncoder [[autodoc]] DPRContextEncoder @@ -101,6 +105,9 @@ Tips: [[autodoc]] DPRReader - forward + + + ## TFDPRContextEncoder [[autodoc]] TFDPRContextEncoder @@ -115,3 +122,7 @@ Tips: [[autodoc]] TFDPRReader - call + + + + diff --git a/docs/source/en/model_doc/efficientformer.md b/docs/source/en/model_doc/efficientformer.md index 1f16f9811b7..92ba90a9e5e 100644 --- a/docs/source/en/model_doc/efficientformer.md +++ b/docs/source/en/model_doc/efficientformer.md @@ -56,6 +56,9 @@ The original code can be found [here](https://github.com/snap-research/Efficient [[autodoc]] EfficientFormerImageProcessor - preprocess + + + ## EfficientFormerModel [[autodoc]] EfficientFormerModel @@ -71,6 +74,9 @@ The original code can be found [here](https://github.com/snap-research/Efficient [[autodoc]] EfficientFormerForImageClassificationWithTeacher - forward + + + ## TFEfficientFormerModel [[autodoc]] TFEfficientFormerModel @@ -85,3 +91,6 @@ The original code can be found [here](https://github.com/snap-research/Efficient [[autodoc]] TFEfficientFormerForImageClassificationWithTeacher - call + + + \ No newline at end of file diff --git a/docs/source/en/model_doc/electra.md b/docs/source/en/model_doc/electra.md index 26830950ae3..700c49df799 100644 --- a/docs/source/en/model_doc/electra.md +++ b/docs/source/en/model_doc/electra.md @@ -50,7 +50,9 @@ using 30x more compute) on the GLUE natural language understanding benchmark. Ou where it performs comparably to RoBERTa and XLNet while using less than 1/4 of their compute and outperforms them when using the same amount of compute.* -Tips: +This model was contributed by [lysandre](https://huggingface.co/lysandre). The original code can be found [here](https://github.com/google-research/electra). + +## Usage tips - ELECTRA is the pretraining approach, therefore there is nearly no changes done to the underlying model: BERT. The only change is the separation of the embedding size and the hidden size: the embedding size is generally smaller, @@ -66,9 +68,7 @@ Tips: [`ElectraForPreTraining`] model (the classification head will be randomly initialized as it doesn't exist in the generator). -This model was contributed by [lysandre](https://huggingface.co/lysandre). The original code can be found [here](https://github.com/google-research/electra). 
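As a concrete illustration of the tip above, the sketch below loads the public small generator and discriminator checkpoints into the two heads they were pretrained with; the checkpoint names are only examples, and any converted ELECTRA checkpoint can be substituted:

```python
# Minimal sketch using the public "small" ELECTRA checkpoints as examples.
import torch
from transformers import AutoTokenizer, ElectraForMaskedLM, ElectraForPreTraining

# The generator is a small masked language model: it proposes replacements for [MASK] tokens.
gen_tokenizer = AutoTokenizer.from_pretrained("google/electra-small-generator")
generator = ElectraForMaskedLM.from_pretrained("google/electra-small-generator")
gen_inputs = gen_tokenizer("The quick brown [MASK] jumps over the lazy dog.", return_tensors="pt")
with torch.no_grad():
    gen_logits = generator(**gen_inputs).logits
mask_positions = (gen_inputs.input_ids == gen_tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
print(gen_tokenizer.decode(gen_logits[0, mask_positions].argmax(-1)))

# The discriminator predicts, for every token, whether it was replaced by the generator.
disc_tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
discriminator = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
disc_inputs = disc_tokenizer("The quick brown dog jumps over the lazy dog.", return_tensors="pt")
with torch.no_grad():
    disc_logits = discriminator(**disc_inputs).logits
print(torch.round(torch.sigmoid(disc_logits)))  # 1 marks tokens the model flags as replaced
```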
- -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -95,6 +95,9 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). The o [[autodoc]] models.electra.modeling_tf_electra.TFElectraForPreTrainingOutput + + + ## ElectraModel [[autodoc]] ElectraModel @@ -135,6 +138,9 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). The o [[autodoc]] ElectraForQuestionAnswering - forward + + + ## TFElectraModel [[autodoc]] TFElectraModel @@ -170,6 +176,9 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). The o [[autodoc]] TFElectraForQuestionAnswering - call + + + ## FlaxElectraModel [[autodoc]] FlaxElectraModel @@ -209,3 +218,6 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). The o [[autodoc]] FlaxElectraForQuestionAnswering - __call__ + + + diff --git a/docs/source/en/model_doc/encodec.md b/docs/source/en/model_doc/encodec.md index bc7f64676ee..856f8be2b80 100644 --- a/docs/source/en/model_doc/encodec.md +++ b/docs/source/en/model_doc/encodec.md @@ -26,6 +26,9 @@ The abstract from the paper is the following: This model was contributed by [Matthijs](https://huggingface.co/Matthijs), [Patrick Von Platen](https://huggingface.co/patrickvonplaten) and [Arthur Zucker](https://huggingface.co/ArthurZ). The original code can be found [here](https://github.com/facebookresearch/encodec). + +## Usage example + Here is a quick example of how to encode and decode an audio using this model: ```python @@ -45,7 +48,6 @@ Here is a quick example of how to encode and decode an audio using this model: >>> audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values ``` - ## EncodecConfig [[autodoc]] EncodecConfig diff --git a/docs/source/en/model_doc/encoder-decoder.md b/docs/source/en/model_doc/encoder-decoder.md index 8e26a3b9e40..54c9f750647 100644 --- a/docs/source/en/model_doc/encoder-decoder.md +++ b/docs/source/en/model_doc/encoder-decoder.md @@ -149,20 +149,32 @@ were contributed by [ydshieh](https://github.com/ydshieh). [[autodoc]] EncoderDecoderConfig + + + ## EncoderDecoderModel [[autodoc]] EncoderDecoderModel - forward - from_encoder_decoder_pretrained + + + ## TFEncoderDecoderModel [[autodoc]] TFEncoderDecoderModel - call - from_encoder_decoder_pretrained + + + ## FlaxEncoderDecoderModel [[autodoc]] FlaxEncoderDecoderModel - __call__ - from_encoder_decoder_pretrained + + + diff --git a/docs/source/en/model_doc/ernie.md b/docs/source/en/model_doc/ernie.md index a64291a7d4f..a5110b2d7b7 100644 --- a/docs/source/en/model_doc/ernie.md +++ b/docs/source/en/model_doc/ernie.md @@ -23,7 +23,7 @@ including [ERNIE1.0](https://arxiv.org/abs/1904.09223), [ERNIE2.0](https://ojs.a These models are contributed by [nghuyong](https://huggingface.co/nghuyong) and the official code can be found in [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) (in PaddlePaddle). 
-### How to use +### Usage example Take `ernie-1.0-base-zh` as an example: ```Python @@ -32,7 +32,7 @@ tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh") model = AutoModel.from_pretrained("nghuyong/ernie-1.0-base-zh") ``` -### Supported Models +### Model checkpoints | Model Name | Language | Description | |:-------------------:|:--------:|:-------------------------------:| @@ -51,7 +51,7 @@ You can find all the supported models from huggingface's model hub: [huggingface repo: [PaddleNLP](https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers/ERNIE/contents.html) and [ERNIE](https://github.com/PaddlePaddle/ERNIE/blob/repro). -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) diff --git a/docs/source/en/model_doc/ernie_m.md b/docs/source/en/model_doc/ernie_m.md index 83e08e09bfc..a99332cb655 100644 --- a/docs/source/en/model_doc/ernie_m.md +++ b/docs/source/en/model_doc/ernie_m.md @@ -25,18 +25,17 @@ Hao Tian, Hua Wu, Haifeng Wang. The abstract from the paper is the following: *Recent studies have demonstrated that pre-trained cross-lingual models achieve impressive performance in downstream cross-lingual tasks. This improvement benefits from learning a large amount of monolingual and parallel corpora. Although it is generally acknowledged that parallel corpora are critical for improving the model performance, existing methods are often constrained by the size of parallel corpora, especially for lowresource languages. In this paper, we propose ERNIE-M, a new training method that encourages the model to align the representation of multiple languages with monolingual corpora, to overcome the constraint that the parallel corpus size places on the model performance. Our key insight is to integrate back-translation into the pre-training process. We generate pseudo-parallel sentence pairs on a monolingual corpus to enable the learning of semantic alignments between different languages, thereby enhancing the semantic modeling of cross-lingual models. Experimental results show that ERNIE-M outperforms existing cross-lingual models and delivers new state-of-the-art results in various cross-lingual downstream tasks.* - -Tips: - -1. Ernie-M is a BERT-like model so it is a stacked Transformer Encoder. -2. Instead of using MaskedLM for pretraining (like BERT) the authors used two novel techniques: `Cross-attention Masked Language Modeling` and `Back-translation Masked Language Modeling`. For now these two LMHead objectives are not implemented here. -3. It is a multilingual language model. -4. Next Sentence Prediction was not used in pretraining process. - - This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). The original code can be found [here](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/paddlenlp/transformers/ernie_m). -## Documentation resources + +## Usage tips + +- Ernie-M is a BERT-like model so it is a stacked Transformer Encoder. +- Instead of using MaskedLM for pretraining (like BERT) the authors used two novel techniques: `Cross-attention Masked Language Modeling` and `Back-translation Masked Language Modeling`. For now these two LMHead objectives are not implemented here. +- It is a multilingual language model. +- Next Sentence Prediction was not used in pretraining process. 
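Since the pretraining heads are not implemented, a typical minimal use is plain feature extraction (or fine-tuning one of the task heads). The sketch below is only an illustration and assumes the `susnato/ernie-m-base_pytorch` checkpoint; substitute whichever ERNIE-M checkpoint you actually use:

```python
# Minimal sketch, with the checkpoint name assumed for illustration only.
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("susnato/ernie-m-base_pytorch")
model = AutoModel.from_pretrained("susnato/ernie-m-base_pytorch")

# The same encoder handles multiple languages out of the box.
inputs = tokenizer(
    ["This is a multilingual sentence.", "Esta es una frase multilingΓΌe."],
    padding=True,
    return_tensors="pt",
)
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
```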
+ +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) diff --git a/docs/source/en/model_doc/esm.md b/docs/source/en/model_doc/esm.md index 47b25650847..46bab860ff4 100644 --- a/docs/source/en/model_doc/esm.md +++ b/docs/source/en/model_doc/esm.md @@ -17,6 +17,7 @@ rendered properly in your Markdown viewer. # ESM ## Overview + This page provides code and pre-trained weights for Transformer protein language models from Meta AI's Fundamental AI Research Team, providing the state-of-the-art ESMFold and ESM-2, and the previously released ESM-1b and ESM-1v. Transformer protein language models were introduced in the paper [Biological structure and function emerge from scaling @@ -73,11 +74,6 @@ sequences with low perplexity that are well understood by the language model. ES order of magnitude faster than AlphaFold2, enabling exploration of the structural space of metagenomic proteins in practical timescales.* - -Tips: - -- ESM models are trained with a masked language modeling (MLM) objective. - The original code can be found [here](https://github.com/facebookresearch/esm) and was was developed by the Fundamental AI Research team at Meta AI. ESM-1b, ESM-1v and ESM-2 were contributed to huggingface by [jasonliu](https://huggingface.co/jasonliu) @@ -87,10 +83,12 @@ ESMFold was contributed to huggingface by [Matt](https://huggingface.co/Rocketkn [Sylvain](https://huggingface.co/sgugger), with a big thank you to Nikita Smetanin, Roshan Rao and Tom Sercu for their help throughout the process! -The HuggingFace port of ESMFold uses portions of the [openfold](https://github.com/aqlaboratory/openfold) library. -The `openfold` library is licensed under the Apache License 2.0. +## Usage tips -## Documentation resources +- ESM models are trained with a masked language modeling (MLM) objective. +- The HuggingFace port of ESMFold uses portions of the [openfold](https://github.com/aqlaboratory/openfold) library. The `openfold` library is licensed under the Apache License 2.0. + +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -109,6 +107,8 @@ The `openfold` library is licensed under the Apache License 2.0. - create_token_type_ids_from_sequences - save_vocabulary + + ## EsmModel @@ -135,6 +135,9 @@ The `openfold` library is licensed under the Apache License 2.0. [[autodoc]] EsmForProteinFolding - forward + + + ## TFEsmModel [[autodoc]] TFEsmModel @@ -154,3 +157,6 @@ The `openfold` library is licensed under the Apache License 2.0. [[autodoc]] TFEsmForTokenClassification - call + + + diff --git a/docs/source/en/model_doc/flan-t5.md b/docs/source/en/model_doc/flan-t5.md index 5d781f75b17..c0fd6b0011c 100644 --- a/docs/source/en/model_doc/flan-t5.md +++ b/docs/source/en/model_doc/flan-t5.md @@ -48,6 +48,10 @@ Google has released the following variants: - [google/flan-t5-xxl](https://huggingface.co/google/flan-t5-xxl). -One can refer to [T5's documentation page](t5) for all tips, code examples and notebooks. As well as the FLAN-T5 model card for more details regarding training and evaluation of the model. - The original checkpoints can be found [here](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints). + + + +Refer to [T5's documentation page](t5) for all API reference, code examples and notebooks. 
For more details regarding training and evaluation of the FLAN-T5, refer to the model card. + + \ No newline at end of file diff --git a/docs/source/en/model_doc/flan-ul2.md b/docs/source/en/model_doc/flan-ul2.md index 40fad51def6..5487bb77976 100644 --- a/docs/source/en/model_doc/flan-ul2.md +++ b/docs/source/en/model_doc/flan-ul2.md @@ -21,7 +21,6 @@ rendered properly in your Markdown viewer. Flan-UL2 is an encoder decoder model based on the T5 architecture. It uses the same configuration as the [UL2](ul2) model released earlier last year. It was fine tuned using the "Flan" prompt tuning and dataset collection. Similar to `Flan-T5`, one can directly use FLAN-UL2 weights without finetuning the model: - According to the original blog here are the notable improvements: - The original UL2 model was only trained with receptive field of 512, which made it non-ideal for N-shot prompting where N is large. @@ -29,9 +28,6 @@ According to the original blog here are the notable improvements: - The original UL2 model also had mode switch tokens that was rather mandatory to get good performance. However, they were a little cumbersome as this requires often some changes during inference or finetuning. In this update/change, we continue training UL2 20B for an additional 100k steps (with small batch) to forget β€œmode tokens” before applying Flan instruction tuning. This Flan-UL2 checkpoint does not require mode tokens anymore. Google has released the following variants: - -One can refer to [T5's documentation page](t5) for all tips, code examples and notebooks. As well as the FLAN-T5 model card for more details regarding training and evaluation of the model. - The original checkpoints can be found [here](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints). @@ -51,6 +47,8 @@ The model is pretty heavy (~40GB in half precision) so if you just want to run t ['In a large skillet, brown the ground beef and onion over medium heat. Add the garlic'] ``` -## Inference + -The inference protocol is exactly the same as any `T5` model, please have a look at the [T5's documentation page](t5) for more details. +Refer to [T5's documentation page](t5) for API reference, tips, code examples and notebooks. + + diff --git a/docs/source/en/model_doc/flaubert.md b/docs/source/en/model_doc/flaubert.md index 3e85bd6fa9d..04bcc2638ac 100644 --- a/docs/source/en/model_doc/flaubert.md +++ b/docs/source/en/model_doc/flaubert.md @@ -50,7 +50,7 @@ This model was contributed by [formiel](https://huggingface.co/formiel). The ori Tips: - Like RoBERTa, without the sentence ordering prediction (so just trained on the MLM objective). -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -66,6 +66,9 @@ Tips: [[autodoc]] FlaubertTokenizer + + + ## FlaubertModel [[autodoc]] FlaubertModel @@ -101,6 +104,9 @@ Tips: [[autodoc]] FlaubertForQuestionAnswering - forward + + + ## TFFlaubertModel [[autodoc]] TFFlaubertModel @@ -130,3 +136,9 @@ Tips: [[autodoc]] TFFlaubertForQuestionAnsweringSimple - call + + + + + + diff --git a/docs/source/en/model_doc/flava.md b/docs/source/en/model_doc/flava.md index ae9da0d184a..d9f9f1de514 100644 --- a/docs/source/en/model_doc/flava.md +++ b/docs/source/en/model_doc/flava.md @@ -33,10 +33,8 @@ at once -- a true vision and language foundation model should be good at vision cross- and multi-modal vision and language tasks. 
We introduce FLAVA as such a model and demonstrate impressive performance on a wide range of 35 tasks spanning these target modalities.* - This model was contributed by [aps](https://huggingface.co/aps). The original code can be found [here](https://github.com/facebookresearch/multimodal/tree/main/examples/flava). - ## FlavaConfig [[autodoc]] FlavaConfig diff --git a/docs/source/en/model_doc/fnet.md b/docs/source/en/model_doc/fnet.md index a6d862f8a1a..1bcae678e63 100644 --- a/docs/source/en/model_doc/fnet.md +++ b/docs/source/en/model_doc/fnet.md @@ -37,15 +37,15 @@ sequence lengths on GPUs (and across relatively shorter lengths on TPUs). Finall and is particularly efficient at smaller model sizes; for a fixed speed and accuracy budget, small FNet models outperform Transformer counterparts.* -Tips on usage: - -- The model was trained without an attention mask as it is based on Fourier Transform. The model was trained with - maximum sequence length 512 which includes pad tokens. Hence, it is highly recommended to use the same maximum - sequence length for fine-tuning and inference. - This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The original code can be found [here](https://github.com/google-research/google-research/tree/master/f_net). -## Documentation resources +## Usage tips + +The model was trained without an attention mask as it is based on Fourier Transform. The model was trained with +maximum sequence length 512 which includes pad tokens. Hence, it is highly recommended to use the same maximum +sequence length for fine-tuning and inference. + +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) diff --git a/docs/source/en/model_doc/focalnet.md b/docs/source/en/model_doc/focalnet.md index 21a75440b13..c4c97980f06 100644 --- a/docs/source/en/model_doc/focalnet.md +++ b/docs/source/en/model_doc/focalnet.md @@ -27,14 +27,9 @@ The abstract from the paper is the following: *We propose focal modulation networks (FocalNets in short), where self-attention (SA) is completely replaced by a focal modulation mechanism for modeling token interactions in vision. Focal modulation comprises three components: (i) hierarchical contextualization, implemented using a stack of depth-wise convolutional layers, to encode visual contexts from short to long ranges, (ii) gated aggregation to selectively gather contexts for each query token based on its content, and (iii) element-wise modulation or affine transformation to inject the aggregated context into the query. Extensive experiments show FocalNets outperform the state-of-the-art SA counterparts (e.g., Swin and Focal Transformers) with similar computational costs on the tasks of image classification, object detection, and segmentation. Specifically, FocalNets with tiny and base size achieve 82.3% and 83.9% top-1 accuracy on ImageNet-1K. After pretrained on ImageNet-22K in 224 resolution, it attains 86.5% and 87.3% top-1 accuracy when finetuned with resolution 224 and 384, respectively. When transferred to downstream tasks, FocalNets exhibit clear superiority. For object detection with Mask R-CNN, FocalNet base trained with 1\times outperforms the Swin counterpart by 2.1 points and already surpasses Swin trained with 3\times schedule (49.0 v.s. 48.5). For semantic segmentation with UPerNet, FocalNet base at single-scale outperforms Swin by 2.4, and beats Swin at multi-scale (50.5 v.s. 49.7). 
Using large FocalNet and Mask2former, we achieve 58.5 mIoU for ADE20K semantic segmentation, and 57.9 PQ for COCO Panoptic Segmentation. Using huge FocalNet and DINO, we achieved 64.3 and 64.4 mAP on COCO minival and test-dev, respectively, establishing new SoTA on top of much larger attention-based models like Swinv2-G and BEIT-3.* -Tips: - -- One can use the [`AutoImageProcessor`] class to prepare images for the model. - This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/FocalNet). - ## FocalNetConfig [[autodoc]] FocalNetConfig diff --git a/docs/source/en/model_doc/fsmt.md b/docs/source/en/model_doc/fsmt.md index 49625f6c472..9419dce71ed 100644 --- a/docs/source/en/model_doc/fsmt.md +++ b/docs/source/en/model_doc/fsmt.md @@ -16,9 +16,6 @@ rendered properly in your Markdown viewer. # FSMT -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign -@stas00. - ## Overview FSMT (FairSeq MachineTranslation) models were introduced in [Facebook FAIR's WMT19 News Translation Task Submission](https://arxiv.org/abs/1907.06616) by Nathan Ng, Kyra Yee, Alexei Baevski, Myle Ott, Michael Auli, Sergey Edunov. diff --git a/docs/source/en/model_doc/funnel.md b/docs/source/en/model_doc/funnel.md index 3cc4eb0aaed..d6929691f40 100644 --- a/docs/source/en/model_doc/funnel.md +++ b/docs/source/en/model_doc/funnel.md @@ -47,7 +47,9 @@ via a decoder. Empirically, with comparable or fewer FLOPs, Funnel-Transformer o a wide variety of sequence-level prediction tasks, including text classification, language understanding, and reading comprehension.* -Tips: +This model was contributed by [sgugger](https://huggingface.co/sgugger). The original code can be found [here](https://github.com/laiguokun/Funnel-Transformer). + +## Usage tips - Since Funnel Transformer uses pooling, the sequence length of the hidden states changes after each block of layers. This way, their length is divided by 2, which speeds up the computation of the next hidden states. The base model therefore has a final sequence length that is a quarter of the original one. This model can be used @@ -62,9 +64,7 @@ Tips: [`FunnelBaseModel`], [`FunnelForSequenceClassification`] and [`FunnelForMultipleChoice`]. -This model was contributed by [sgugger](https://huggingface.co/sgugger). The original code can be found [here](https://github.com/laiguokun/Funnel-Transformer). - -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -95,6 +95,9 @@ This model was contributed by [sgugger](https://huggingface.co/sgugger). The ori [[autodoc]] models.funnel.modeling_tf_funnel.TFFunnelForPreTrainingOutput + + + ## FunnelBaseModel [[autodoc]] FunnelBaseModel @@ -135,6 +138,9 @@ This model was contributed by [sgugger](https://huggingface.co/sgugger). The ori [[autodoc]] FunnelForQuestionAnswering - forward + + + ## TFFunnelBaseModel [[autodoc]] TFFunnelBaseModel @@ -174,3 +180,6 @@ This model was contributed by [sgugger](https://huggingface.co/sgugger). 
The ori [[autodoc]] TFFunnelForQuestionAnswering - call + + + diff --git a/docs/source/en/model_doc/git.md b/docs/source/en/model_doc/git.md index b0c96200af3..bffa98b89e3 100644 --- a/docs/source/en/model_doc/git.md +++ b/docs/source/en/model_doc/git.md @@ -27,11 +27,6 @@ The abstract from the paper is the following: *In this paper, we design and train a Generative Image-to-text Transformer, GIT, to unify vision-language tasks such as image/video captioning and question answering. While generative models provide a consistent network architecture between pre-training and fine-tuning, existing work typically contains complex structures (uni/multi-modal encoder/decoder) and depends on external modules such as object detectors/taggers and optical character recognition (OCR). In GIT, we simplify the architecture as one image encoder and one text decoder under a single language modeling task. We also scale up the pre-training data and the model size to boost the model performance. Without bells and whistles, our GIT establishes new state of the arts on 12 challenging benchmarks with a large margin. For instance, our model surpasses the human performance for the first time on TextCaps (138.2 vs. 125.5 in CIDEr). Furthermore, we present a new scheme of generation-based image classification and scene text recognition, achieving decent performance on standard benchmarks.* -Tips: - -- GIT is implemented in a very similar way to GPT-2, the only difference being that the model is also conditioned on `pixel_values`. -- One can use [`GitProcessor`] to prepare images for the model, and the `generate` method for autoregressive generation. - drawing @@ -40,6 +35,10 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/GenerativeImage2Text). +## Usage tips + +- GIT is implemented in a very similar way to GPT-2, the only difference being that the model is also conditioned on `pixel_values`. + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with GIT. diff --git a/docs/source/en/model_doc/glpn.md b/docs/source/en/model_doc/glpn.md index be9a7d2d791..b57d1a7ccdd 100644 --- a/docs/source/en/model_doc/glpn.md +++ b/docs/source/en/model_doc/glpn.md @@ -33,10 +33,6 @@ The abstract from the paper is the following: *Depth estimation from a single image is an important task that can be applied to various fields in computer vision, and has grown rapidly with the development of convolutional neural networks. In this paper, we propose a novel structure and training strategy for monocular depth estimation to further improve the prediction accuracy of the network. We deploy a hierarchical transformer encoder to capture and convey the global context, and design a lightweight yet powerful decoder to generate an estimated depth map while considering local connectivity. By constructing connected paths between multi-scale local features and the global decoding stream with our proposed selective feature fusion module, the network can integrate both representations and recover fine details. In addition, the proposed decoder shows better performance than the previously proposed decoders, with considerably less computational complexity. Furthermore, we improve the depth-specific augmentation method by utilizing an important observation in depth estimation to enhance the model. 
Our network achieves state-of-the-art performance over the challenging depth dataset NYU Depth V2. Extensive experiments have been conducted to validate and show the effectiveness of the proposed approach. Finally, our model shows better generalisation ability and robustness than other comparative models.* -Tips: - -- One can use [`GLPNImageProcessor`] to prepare images for the model. - drawing diff --git a/docs/source/en/model_doc/gpt-sw3.md b/docs/source/en/model_doc/gpt-sw3.md index 286cac12c99..f4d34a07212 100644 --- a/docs/source/en/model_doc/gpt-sw3.md +++ b/docs/source/en/model_doc/gpt-sw3.md @@ -32,12 +32,8 @@ causal language modeling (CLM) objective utilizing the NeMo Megatron GPT impleme This model was contributed by [AI Sweden](https://huggingface.co/AI-Sweden). -The implementation uses the [GPT2Model](https://huggingface.co/docs/transformers/model_doc/gpt2) coupled -with our `GPTSw3Tokenizer`. This means that `AutoTokenizer` and `AutoModelForCausalLM` map to our tokenizer -implementation and the corresponding GPT2 model implementation respectively. -*Note that sentencepiece is required to use our tokenizer and can be installed with:* `pip install transformers[sentencepiece]` or `pip install sentencepiece` +## Usage example -Example usage: ```python >>> from transformers import AutoTokenizer, AutoModelForCausalLM @@ -52,12 +48,21 @@ Example usage: TrΓ€d Γ€r fina fΓΆr att de Γ€r fΓ€rgstarka. Men ibland Γ€r det fint ``` -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) - [Causal language modeling task guide](../tasks/language_modeling) + + +The implementation uses the `GPT2Model` coupled with our `GPTSw3Tokenizer`. Refer to [GPT2Model documentation](gpt2) +for API reference and examples. + +Note that sentencepiece is required to use our tokenizer and can be installed with `pip install transformers[sentencepiece]` or `pip install sentencepiece` + + + ## GPTSw3Tokenizer [[autodoc]] GPTSw3Tokenizer diff --git a/docs/source/en/model_doc/gpt2.md b/docs/source/en/model_doc/gpt2.md index 878bf84a3fa..4708edde0b6 100644 --- a/docs/source/en/model_doc/gpt2.md +++ b/docs/source/en/model_doc/gpt2.md @@ -39,7 +39,13 @@ text. The diversity of the dataset causes this simple goal to contain naturally across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X the parameters and trained on more than 10X the amount of data.* -Tips: +[Write With Transformer](https://transformer.huggingface.co/doc/gpt2-large) is a webapp created and hosted by +Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five +different sizes: small, medium, large, xl and a distilled version of the small checkpoint: *distilgpt-2*. + +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://openai.com/blog/better-language-models/). + +## Usage tips - GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. @@ -54,12 +60,6 @@ Tips: - Enabling the *scale_attn_by_inverse_layer_idx* and *reorder_and_upcast_attn* flags will apply the training stability improvements from [Mistral](https://github.com/stanford-crfm/mistral/) (for PyTorch only). 
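A minimal generation sketch tying the tips above together is shown below; the `gpt2` checkpoint is used only as an example and any GPT-2 variant works the same way:

```python
# A short, hedged generation sketch; "gpt2" is used only as an example checkpoint.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("GPT-2 was trained with a causal language modeling objective, so it", return_tensors="pt")
# generate() caches past key/value pairs internally, so each new token only requires
# a forward pass over the newly generated position.
outputs = model.generate(
    **inputs,
    max_new_tokens=30,
    do_sample=True,
    top_k=50,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```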
-[Write With Transformer](https://transformer.huggingface.co/doc/gpt2-large) is a webapp created and hosted by -Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five -different sizes: small, medium, large, xl and a distilled version of the small checkpoint: *distilgpt-2*. - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://openai.com/blog/better-language-models/). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with GPT2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. @@ -100,6 +100,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] models.gpt2.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput + + + ## GPT2Model [[autodoc]] GPT2Model @@ -130,6 +133,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] GPT2ForTokenClassification - forward + + + ## TFGPT2Model [[autodoc]] TFGPT2Model @@ -158,6 +164,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TFGPT2Tokenizer + + + ## FlaxGPT2Model [[autodoc]] FlaxGPT2Model @@ -167,3 +176,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] FlaxGPT2LMHeadModel - __call__ + + + diff --git a/docs/source/en/model_doc/gpt_bigcode.md b/docs/source/en/model_doc/gpt_bigcode.md index 8cc77a825de..0f3bc72d03a 100644 --- a/docs/source/en/model_doc/gpt_bigcode.md +++ b/docs/source/en/model_doc/gpt_bigcode.md @@ -20,13 +20,13 @@ rendered properly in your Markdown viewer. The GPTBigCode model was proposed in [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by BigCode. The listed authors are: Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo GarcΓ­a del RΓ­o, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. -The abstract from the paper is the following:uery +The abstract from the paper is the following: *The BigCode project is an open-scientific collaboration working on the responsible development of large language models for code. This tech report describes the progress of the collaboration until December 2022, outlining the current state of the Personally Identifiable Information (PII) redaction pipeline, the experiments conducted to de-risk the model architecture, and the experiments investigating better preprocessing methods for the training data. We train 1.1B parameter models on the Java, JavaScript, and Python subsets of The Stack and evaluate them on the MultiPL-E text-to-code benchmark. 
We find that more aggressive filtering of near-duplicates can further boost performance and, surprisingly, that selecting files from repositories with 5+ GitHub stars deteriorates performance significantly. Our best model outperforms previous open-source multilingual code generation models (InCoder-6.7B and CodeGen-Multi-2.7B) in both left-to-right generation and infilling on the Java, JavaScript, and Python portions of MultiPL-E, despite being a substantially smaller model. All models are released under an OpenRAIL license at [this https URL.](https://huggingface.co/bigcode)* -The model is a an optimized [GPT2 model](https://huggingface.co/docs/transformers/model_doc/gpt2) with support for Multi-Query Attention. +The model is an optimized [GPT2 model](https://huggingface.co/docs/transformers/model_doc/gpt2) with support for Multi-Query Attention. -## Technical details +## Implementation details The main differences compared to GPT2. - Added support for Multi-Query Attention. @@ -85,7 +85,6 @@ Below is a expected speedup diagram that compares pure inference time between th [[autodoc]] GPTBigCodeConfig - ## GPTBigCodeModel [[autodoc]] GPTBigCodeModel @@ -96,7 +95,6 @@ Below is a expected speedup diagram that compares pure inference time between th [[autodoc]] GPTBigCodeForCausalLM - forward - ## GPTBigCodeForSequenceClassification [[autodoc]] GPTBigCodeForSequenceClassification diff --git a/docs/source/en/model_doc/gpt_neo.md b/docs/source/en/model_doc/gpt_neo.md index 6b925aad10e..6a598ef6a96 100644 --- a/docs/source/en/model_doc/gpt_neo.md +++ b/docs/source/en/model_doc/gpt_neo.md @@ -27,7 +27,7 @@ The architecture is similar to GPT2 except that GPT Neo uses local attention in This model was contributed by [valhalla](https://huggingface.co/valhalla). -### Generation +## Usage example The `generate()` method can be used to generate text using GPT Neo model. @@ -54,7 +54,7 @@ The `generate()` method can be used to generate text using GPT Neo model. >>> gen_text = tokenizer.batch_decode(gen_tokens)[0] ``` -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Causal language modeling task guide](../tasks/language_modeling) @@ -63,6 +63,10 @@ The `generate()` method can be used to generate text using GPT Neo model. [[autodoc]] GPTNeoConfig + + + + ## GPTNeoModel [[autodoc]] GPTNeoModel @@ -88,6 +92,9 @@ The `generate()` method can be used to generate text using GPT Neo model. [[autodoc]] GPTNeoForTokenClassification - forward + + + ## FlaxGPTNeoModel [[autodoc]] FlaxGPTNeoModel @@ -97,3 +104,8 @@ The `generate()` method can be used to generate text using GPT Neo model. [[autodoc]] FlaxGPTNeoForCausalLM - __call__ + + + + + diff --git a/docs/source/en/model_doc/gpt_neox.md b/docs/source/en/model_doc/gpt_neox.md index 0ee7c8630c6..300001ad5bb 100644 --- a/docs/source/en/model_doc/gpt_neox.md +++ b/docs/source/en/model_doc/gpt_neox.md @@ -38,7 +38,7 @@ model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b").half().cud GPT-NeoX-20B also has a different tokenizer from the one used in GPT-J-6B and GPT-Neo. The new tokenizer allocates additional tokens to whitespace characters, making the model more suitable for certain tasks like code generation. -### Generation +## Usage example The `generate()` method can be used to generate text using GPT Neo model. @@ -61,7 +61,7 @@ The `generate()` method can be used to generate text using GPT Neo model. 
>>> gen_text = tokenizer.batch_decode(gen_tokens)[0] ``` -## Documentation resources +## Resources - [Causal language modeling task guide](../tasks/language_modeling) diff --git a/docs/source/en/model_doc/gpt_neox_japanese.md b/docs/source/en/model_doc/gpt_neox_japanese.md index c21ba838792..c69e643cae5 100644 --- a/docs/source/en/model_doc/gpt_neox_japanese.md +++ b/docs/source/en/model_doc/gpt_neox_japanese.md @@ -25,7 +25,7 @@ Following the recommendations from Google's research on [PaLM](https://ai.google Development of the model was led by [Shinya Otani](https://github.com/SO0529), [Takayoshi Makabe](https://github.com/spider-man-tm), [Anuj Arora](https://github.com/Anuj040), and [Kyo Hattori](https://github.com/go5paopao) from [ABEJA, Inc.](https://www.abejainc.com/). For more information on this model-building activity, please refer [here (ja)](https://tech-blog.abeja.asia/entry/abeja-gpt-project-202207). -### Generation +### Usage example The `generate()` method can be used to generate text using GPT NeoX Japanese model. @@ -51,7 +51,7 @@ The `generate()` method can be used to generate text using GPT NeoX Japanese mod 人とAIγŒε”θͺΏγ™γ‚‹γŸγ‚γ«γ―、AIγ¨δΊΊγŒε…±ε­˜γ—γ€AIγ‚’ζ­£γ—γη†θ§£γ™γ‚‹εΏ…θ¦γŒγ‚γ‚ŠγΎγ™γ€‚ ``` -## Documentation resources +## Resources - [Causal language modeling task guide](../tasks/language_modeling) diff --git a/docs/source/en/model_doc/gptj.md b/docs/source/en/model_doc/gptj.md index 5ad80a01095..b515cf36dd4 100644 --- a/docs/source/en/model_doc/gptj.md +++ b/docs/source/en/model_doc/gptj.md @@ -23,7 +23,7 @@ causal language model trained on [the Pile](https://pile.eleuther.ai/) dataset. This model was contributed by [Stella Biderman](https://huggingface.co/stellaathena). -Tips: +## Usage tips - To load [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) in float32 one would need at least 2x model size RAM: 1x for initial weights and another 1x to load the checkpoint. So for GPT-J it would take at least 48GB @@ -56,7 +56,7 @@ Tips: size, the tokenizer for [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) contains 143 extra tokens `<|extratoken_1|>... <|extratoken_143|>`, so the `vocab_size` of tokenizer also becomes 50400. -### Generation +## Usage examples The [`~generation.GenerationMixin.generate`] method can be used to generate text using GPT-J model. @@ -138,6 +138,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] GPTJConfig - all + + + ## GPTJModel [[autodoc]] GPTJModel @@ -158,6 +161,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] GPTJForQuestionAnswering - forward + + + ## TFGPTJModel [[autodoc]] TFGPTJModel @@ -178,6 +184,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TFGPTJForQuestionAnswering - call + + + ## FlaxGPTJModel [[autodoc]] FlaxGPTJModel @@ -187,3 +196,5 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] FlaxGPTJForCausalLM - __call__ + + diff --git a/docs/source/en/model_doc/gptsan-japanese.md b/docs/source/en/model_doc/gptsan-japanese.md index 48f67f85065..1e6b1b6e1cf 100644 --- a/docs/source/en/model_doc/gptsan-japanese.md +++ b/docs/source/en/model_doc/gptsan-japanese.md @@ -24,7 +24,7 @@ GPTSAN is a Japanese language model using Switch Transformer. It has the same st in the T5 paper, and support both Text Generation and Masked Language Modeling tasks. These basic tasks similarly can fine-tune for translation or summarization. 
-### Generation +### Usage example The `generate()` method can be used to generate text using GPTSAN-Japanese model. @@ -56,7 +56,7 @@ This length applies to the text entered in `prefix_text` for the tokenizer. The tokenizer returns the mask of the `Prefix` part of Prefix-LM as `token_type_ids`. The model treats the part where `token_type_ids` is 1 as a `Prefix` part, that is, the input can refer to both tokens before and after. -Tips: +## Usage tips Specifying the Prefix part is done with a mask passed to self-attention. When token_type_ids=None or all zero, it is equivalent to regular causal mask diff --git a/docs/source/en/model_doc/graphormer.md b/docs/source/en/model_doc/graphormer.md index 16d61bccbef..08e3f5fb3e9 100644 --- a/docs/source/en/model_doc/graphormer.md +++ b/docs/source/en/model_doc/graphormer.md @@ -23,26 +23,24 @@ The abstract from the paper is the following: *The Transformer architecture has become a dominant choice in many domains, such as natural language processing and computer vision. Yet, it has not achieved competitive performance on popular leaderboards of graph-level prediction compared to mainstream GNN variants. Therefore, it remains a mystery how Transformers could perform well for graph representation learning. In this paper, we solve this mystery by presenting Graphormer, which is built upon the standard Transformer architecture, and could attain excellent results on a broad range of graph representation learning tasks, especially on the recent OGB Large-Scale Challenge. Our key insight to utilizing Transformer in the graph is the necessity of effectively encoding the structural information of a graph into the model. To this end, we propose several simple yet effective structural encoding methods to help Graphormer better model graph-structured data. Besides, we mathematically characterize the expressive power of Graphormer and exhibit that with our ways of encoding the structural information of graphs, many popular GNN variants could be covered as the special cases of Graphormer.* -Tips: +This model was contributed by [clefourrier](https://huggingface.co/clefourrier). The original code can be found [here](https://github.com/microsoft/Graphormer). + +## Usage tips This model will not work well on large graphs (more than 100 nodes/edges), as it will make the memory explode. You can reduce the batch size, increase your RAM, or decrease the `UNREACHABLE_NODE_DISTANCE` parameter in algos_graphormer.pyx, but it will be hard to go above 700 nodes/edges. This model does not use a tokenizer, but instead a special collator during training. -This model was contributed by [clefourrier](https://huggingface.co/clefourrier). The original code can be found [here](https://github.com/microsoft/Graphormer). - ## GraphormerConfig [[autodoc]] GraphormerConfig - ## GraphormerModel [[autodoc]] GraphormerModel - forward - ## GraphormerForGraphClassification [[autodoc]] GraphormerForGraphClassification diff --git a/docs/source/en/model_doc/groupvit.md b/docs/source/en/model_doc/groupvit.md index cf006e284b1..8728cf0da21 100644 --- a/docs/source/en/model_doc/groupvit.md +++ b/docs/source/en/model_doc/groupvit.md @@ -25,13 +25,13 @@ The abstract from the paper is the following: *Grouping and recognition are important components of visual scene understanding, e.g., for object detection and semantic segmentation. With end-to-end deep learning systems, grouping of image regions usually happens implicitly via top-down supervision from pixel-level recognition labels. 
Instead, in this paper, we propose to bring back the grouping mechanism into deep networks, which allows semantic segments to emerge automatically with only text supervision. We propose a hierarchical Grouping Vision Transformer (GroupViT), which goes beyond the regular grid structure representation and learns to group image regions into progressively larger arbitrary-shaped segments. We train GroupViT jointly with a text encoder on a large-scale image-text dataset via contrastive losses. With only text supervision and without any pixel-level annotations, GroupViT learns to group together semantic regions and successfully transfers to the task of semantic segmentation in a zero-shot manner, i.e., without any further fine-tuning. It achieves a zero-shot accuracy of 52.3% mIoU on the PASCAL VOC 2012 and 22.4% mIoU on PASCAL Context datasets, and performs competitively to state-of-the-art transfer-learning methods requiring greater levels of supervision.* -Tips: - -- You may specify `output_segmentation=True` in the forward of `GroupViTModel` to get the segmentation logits of input texts. - This model was contributed by [xvjiarui](https://huggingface.co/xvjiarui). The TensorFlow version was contributed by [ariG23498](https://huggingface.co/ariG23498) with the help of [Yih-Dar SHIEH](https://huggingface.co/ydshieh), [Amy Roberts](https://huggingface.co/amyeroberts), and [Joao Gante](https://huggingface.co/joaogante). The original code can be found [here](https://github.com/NVlabs/GroupViT). +## Usage tips + +- You may specify `output_segmentation=True` in the forward of `GroupViTModel` to get the segmentation logits of input texts. + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with GroupViT. @@ -52,6 +52,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] GroupViTVisionConfig + + + ## GroupViTModel [[autodoc]] GroupViTModel @@ -69,6 +72,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] GroupViTVisionModel - forward + + + ## TFGroupViTModel [[autodoc]] TFGroupViTModel @@ -84,4 +90,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h ## TFGroupViTVisionModel [[autodoc]] TFGroupViTVisionModel - - call \ No newline at end of file + - call + + + diff --git a/docs/source/en/model_doc/herbert.md b/docs/source/en/model_doc/herbert.md index ee927bddb02..0049d6bfcf3 100644 --- a/docs/source/en/model_doc/herbert.md +++ b/docs/source/en/model_doc/herbert.md @@ -37,7 +37,11 @@ which has the best average performance and obtains the best results for three ou extensive evaluation, including several standard baselines and recently proposed, multilingual Transformer-based models.* -Examples of use: +This model was contributed by [rmroczkowski](https://huggingface.co/rmroczkowski). The original code can be found +[here](https://github.com/allegro/HerBERT). + + +## Usage example ```python >>> from transformers import HerbertTokenizer, RobertaModel @@ -56,9 +60,12 @@ Examples of use: >>> model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1") ``` -This model was contributed by [rmroczkowski](https://huggingface.co/rmroczkowski). The original code can be found -[here](https://github.com/allegro/HerBERT). + +Herbert implementation is the same as `BERT` except for the tokenization method. Refer to [BERT documentation](bert) +for API reference and examples. 
+ + ## HerbertTokenizer diff --git a/docs/source/en/model_doc/hubert.md b/docs/source/en/model_doc/hubert.md index 5349e138852..43ce590d371 100644 --- a/docs/source/en/model_doc/hubert.md +++ b/docs/source/en/model_doc/hubert.md @@ -36,15 +36,15 @@ state-of-the-art wav2vec 2.0 performance on the Librispeech (960h) and Libri-lig 10h, 100h, and 960h fine-tuning subsets. Using a 1B parameter model, HuBERT shows up to 19% and 13% relative WER reduction on the more challenging dev-other and test-other evaluation subsets.* -Tips: +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). + +# Usage tips - Hubert is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. - Hubert model was fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using [`Wav2Vec2CTCTokenizer`]. -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). - -## Documentation resources +## Resources - [Audio classification task guide](../tasks/audio_classification) - [Automatic speech recognition task guide](../tasks/asr) @@ -53,6 +53,9 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv [[autodoc]] HubertConfig + + + ## HubertModel [[autodoc]] HubertModel @@ -68,6 +71,9 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv [[autodoc]] HubertForSequenceClassification - forward + + + ## TFHubertModel [[autodoc]] TFHubertModel @@ -77,3 +83,6 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv [[autodoc]] TFHubertForCTC - call + + + diff --git a/docs/source/en/model_doc/ibert.md b/docs/source/en/model_doc/ibert.md index 9c5f9c3e8de..9ea623951ae 100644 --- a/docs/source/en/model_doc/ibert.md +++ b/docs/source/en/model_doc/ibert.md @@ -40,7 +40,7 @@ been open-sourced.* This model was contributed by [kssteven](https://huggingface.co/kssteven). The original code can be found [here](https://github.com/kssteven418/I-BERT). -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) diff --git a/docs/source/en/model_doc/idefics.md b/docs/source/en/model_doc/idefics.md index e0017df0c52..9989f89d682 100644 --- a/docs/source/en/model_doc/idefics.md +++ b/docs/source/en/model_doc/idefics.md @@ -31,9 +31,9 @@ This model was contributed by [HuggingFaceM4](https://huggingface.co/HuggingFace -Idefics modeling code in Transformers is for finetuning and inferencing the pre-trained Idefics models. +IDEFICS modeling code in Transformers is for finetuning and inferencing the pre-trained IDEFICS models. -To train a new Idefics model from scratch use the m4 codebase (a link will be provided once it's made public) +To train a new IDEFICS model from scratch use the m4 codebase (a link will be provided once it's made public) diff --git a/docs/source/en/model_doc/imagegpt.md b/docs/source/en/model_doc/imagegpt.md index 01eb7dde5fc..53a7ba3b34b 100644 --- a/docs/source/en/model_doc/imagegpt.md +++ b/docs/source/en/model_doc/imagegpt.md @@ -40,7 +40,7 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr), based on [this issue](https://github.com/openai/image-gpt/issues/7). The original code can be found [here](https://github.com/openai/image-gpt). 
-Tips: +## Usage tips - ImageGPT is almost exactly the same as [GPT-2](gpt2), with the exception that a different activation function is used (namely "quick gelu"), and the layer normalization layers don't mean center the inputs. ImageGPT @@ -92,7 +92,6 @@ If you're interested in submitting a resource to be included here, please feel f ## ImageGPTFeatureExtractor [[autodoc]] ImageGPTFeatureExtractor - - __call__ ## ImageGPTImageProcessor @@ -103,17 +102,14 @@ If you're interested in submitting a resource to be included here, please feel f ## ImageGPTModel [[autodoc]] ImageGPTModel - - forward ## ImageGPTForCausalImageModeling [[autodoc]] ImageGPTForCausalImageModeling - - forward ## ImageGPTForImageClassification [[autodoc]] ImageGPTForImageClassification - - forward diff --git a/docs/source/en/model_doc/informer.md b/docs/source/en/model_doc/informer.md index 0d2d82a3f57..8100b284432 100644 --- a/docs/source/en/model_doc/informer.md +++ b/docs/source/en/model_doc/informer.md @@ -39,13 +39,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] InformerConfig - ## InformerModel [[autodoc]] InformerModel - forward - ## InformerForPrediction [[autodoc]] InformerForPrediction diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md index d2cf80e50a5..1a693493fff 100644 --- a/docs/source/en/model_doc/instructblip.md +++ b/docs/source/en/model_doc/instructblip.md @@ -21,10 +21,6 @@ The abstract from the paper is the following: *General-purpose language models that can solve various language-domain tasks have emerged driven by the pre-training and instruction-tuning pipeline. However, building general-purpose vision-language models is challenging due to the increased task discrepancy introduced by the additional visual input. Although vision-language pre-training has been widely studied, vision-language instruction tuning remains relatively less explored. In this paper, we conduct a systematic and comprehensive study on vision-language instruction tuning based on the pre-trained BLIP-2 models. We gather a wide variety of 26 publicly available datasets, transform them into instruction tuning format and categorize them into two clusters for held-in instruction tuning and held-out zero-shot evaluation. Additionally, we introduce instruction-aware visual feature extraction, a crucial method that enables the model to extract informative features tailored to the given instruction. The resulting InstructBLIP models achieve state-of-the-art zero-shot performance across all 13 held-out datasets, substantially outperforming BLIP-2 and the larger Flamingo. Our models also lead to state-of-the-art performance when finetuned on individual downstream tasks (e.g., 90.7% accuracy on ScienceQA IMG). Furthermore, we qualitatively demonstrate the advantages of InstructBLIP over concurrent multimodal models.* -Tips: - -- InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but important difference: it also feeds the text prompt (instruction) to the Q-Former. - drawing @@ -33,6 +29,9 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip). +## Usage tips + +InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but important difference: it also feeds the text prompt (instruction) to the Q-Former. 
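A minimal sketch of that flow is shown below: the processor prepares both the image and the instruction (including the Q-Former inputs), and `generate()` produces the answer. The `Salesforce/instructblip-vicuna-7b` checkpoint is used as an example; it is large, so in practice you would likely load it in half precision and/or with `device_map="auto"` (which requires `accelerate`):

```python
# Minimal sketch; "Salesforce/instructblip-vicuna-7b" is one public checkpoint and is
# used here only as an example (it is large, so expect significant memory usage).
import requests
from PIL import Image
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
prompt = "Describe what is unusual about this image."

# The processor prepares the language-model inputs as well as the Q-Former inputs
# (qformer_input_ids / qformer_attention_mask) carrying the instruction.
inputs = processor(images=image, text=prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(outputs, skip_special_tokens=True)[0].strip())
```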
## InstructBlipConfig diff --git a/docs/source/en/model_doc/jukebox.md b/docs/source/en/model_doc/jukebox.md index 24a80164a2d..a6d865d86cc 100644 --- a/docs/source/en/model_doc/jukebox.md +++ b/docs/source/en/model_doc/jukebox.md @@ -32,7 +32,11 @@ The metadata such as *artist, genre and timing* are passed to each prior, in the ![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg) -Tips: +This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). +The original code can be found [here](https://github.com/openai/jukebox). + +## Usage tips + - This model only supports inference. This is for a few reasons, mostly because it requires a crazy amount of memory to train. Feel free to open a PR and add what's missing to have a full integration with the hugging face traineer! - This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU. In order automaticallay handle the device on which the model should execute, use `accelerate`. - Contrary to the paper, the order of the priors goes from `0` to `1` as it felt more intuitive : we sample starting from `0`. @@ -67,14 +71,12 @@ The original code can be found [here](https://github.com/openai/jukebox). - upsample - _sample - ## JukeboxPrior [[autodoc]] JukeboxPrior - sample - forward - ## JukeboxVQVAE [[autodoc]] JukeboxVQVAE diff --git a/docs/source/en/model_doc/layoutlm.md b/docs/source/en/model_doc/layoutlm.md index ebf6b1a4b4f..34b429fb737 100644 --- a/docs/source/en/model_doc/layoutlm.md +++ b/docs/source/en/model_doc/layoutlm.md @@ -46,7 +46,7 @@ document-level pretraining. It achieves new state-of-the-art results in several understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image classification (from 93.07 to 94.42).* -Tips: +## Usage tips - In addition to *input_ids*, [`~transformers.LayoutLMModel.forward`] also expects the input `bbox`, which are the bounding boxes (i.e. 2D-positions) of the input tokens. These can be obtained using an external OCR engine such @@ -123,6 +123,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] LayoutLMTokenizerFast + + + ## LayoutLMModel [[autodoc]] LayoutLMModel @@ -143,6 +146,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] LayoutLMForQuestionAnswering + + + ## TFLayoutLMModel [[autodoc]] TFLayoutLMModel @@ -162,3 +168,8 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h ## TFLayoutLMForQuestionAnswering [[autodoc]] TFLayoutLMForQuestionAnswering + + + + + diff --git a/docs/source/en/model_doc/layoutlmv2.md b/docs/source/en/model_doc/layoutlmv2.md index f2a1c65a42b..15286d4ddb7 100644 --- a/docs/source/en/model_doc/layoutlmv2.md +++ b/docs/source/en/model_doc/layoutlmv2.md @@ -56,7 +56,7 @@ python -m pip install torchvision tesseract ``` (If you are developing for LayoutLMv2, note that passing the doctests also requires the installation of these packages.) -Tips: +## Usage tips - The main difference between LayoutLMv1 and LayoutLMv2 is that the latter incorporates visual embeddings during pre-training (while LayoutLMv1 only adds visual embeddings during fine-tuning). 
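As a small illustration of how the text and image modalities are combined, the sketch below runs the processor (which performs OCR with Tesseract by default) and feeds the result to the base model. It assumes detectron2, torchvision and pytesseract are installed, uses the `microsoft/layoutlmv2-base-uncased` checkpoint as an example, and uses a placeholder document path:

```python
# Small illustrative sketch; it assumes detectron2, torchvision and pytesseract are
# installed, and the document path is a placeholder you should replace.
from PIL import Image
from transformers import LayoutLMv2Processor, LayoutLMv2Model

processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")

image = Image.open("path/to/your/document.png").convert("RGB")
# By default the processor runs Tesseract OCR on the image and returns the token ids,
# normalized bounding boxes and the resized image in a single encoding.
encoding = processor(image, return_tensors="pt")
outputs = model(**encoding)
print(outputs.last_hidden_state.shape)  # text tokens plus visual tokens along the sequence axis
```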
diff --git a/docs/source/en/model_doc/layoutlmv3.md b/docs/source/en/model_doc/layoutlmv3.md index 22e2c3ff718..87ff32f3835 100644 --- a/docs/source/en/model_doc/layoutlmv3.md +++ b/docs/source/en/model_doc/layoutlmv3.md @@ -26,16 +26,6 @@ The abstract from the paper is the following: *Self-supervised pre-training techniques have achieved remarkable progress in Document AI. Most multimodal pre-trained models use a masked language modeling objective to learn bidirectional representations on the text modality, but they differ in pre-training objectives for the image modality. This discrepancy adds difficulty to multimodal representation learning. In this paper, we propose LayoutLMv3 to pre-train multimodal Transformers for Document AI with unified text and image masking. Additionally, LayoutLMv3 is pre-trained with a word-patch alignment objective to learn cross-modal alignment by predicting whether the corresponding image patch of a text word is masked. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model for both text-centric and image-centric Document AI tasks. Experimental results show that LayoutLMv3 achieves state-of-the-art performance not only in text-centric tasks, including form understanding, receipt understanding, and document visual question answering, but also in image-centric tasks such as document image classification and document layout analysis.* -Tips: - -- In terms of data processing, LayoutLMv3 is identical to its predecessor [LayoutLMv2](layoutlmv2), except that: - - images need to be resized and normalized with channels in regular RGB format. LayoutLMv2 on the other hand normalizes the images internally and expects the channels in BGR format. - - text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece. - Due to these differences in data preprocessing, one can use [`LayoutLMv3Processor`] which internally combines a [`LayoutLMv3ImageProcessor`] (for the image modality) and a [`LayoutLMv3Tokenizer`]/[`LayoutLMv3TokenizerFast`] (for the text modality) to prepare all data for the model. -- Regarding usage of [`LayoutLMv3Processor`], we refer to the [usage guide](layoutlmv2#usage-layoutlmv2processor) of its predecessor. -- Demo notebooks for LayoutLMv3 can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/LayoutLMv3). -- Demo scripts can be found [here](https://github.com/huggingface/transformers/tree/main/examples/research_projects/layoutlmv3). - drawing @@ -43,6 +33,14 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version of this model was added by [chriskoo](https://huggingface.co/chriskoo), [tokec](https://huggingface.co/tokec), and [lre](https://huggingface.co/lre). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/layoutlmv3). +## Usage tips + +- In terms of data processing, LayoutLMv3 is identical to its predecessor [LayoutLMv2](layoutlmv2), except that: + - images need to be resized and normalized with channels in regular RGB format. LayoutLMv2 on the other hand normalizes the images internally and expects the channels in BGR format. + - text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece. 
+ Due to these differences in data preprocessing, one can use [`LayoutLMv3Processor`] which internally combines a [`LayoutLMv3ImageProcessor`] (for the image modality) and a [`LayoutLMv3Tokenizer`]/[`LayoutLMv3TokenizerFast`] (for the text modality) to prepare all data for the model. +- Regarding usage of [`LayoutLMv3Processor`], we refer to the [usage guide](layoutlmv2#usage-layoutlmv2processor) of its predecessor. + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LayoutLMv3. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. @@ -53,6 +51,9 @@ LayoutLMv3 is nearly identical to LayoutLMv2, so we've also included LayoutLMv2 +- Demo notebooks for LayoutLMv3 can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/LayoutLMv3). +- Demo scripts can be found [here](https://github.com/huggingface/transformers/tree/main/examples/research_projects/layoutlmv3). + - [`LayoutLMv2ForSequenceClassification`] is supported by this [notebook](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/RVL-CDIP/Fine_tuning_LayoutLMv2ForSequenceClassification_on_RVL_CDIP.ipynb). @@ -103,6 +104,9 @@ LayoutLMv3 is nearly identical to LayoutLMv2, so we've also included LayoutLMv2 [[autodoc]] LayoutLMv3Processor - __call__ + + + ## LayoutLMv3Model [[autodoc]] LayoutLMv3Model @@ -123,6 +127,9 @@ LayoutLMv3 is nearly identical to LayoutLMv2, so we've also included LayoutLMv2 [[autodoc]] LayoutLMv3ForQuestionAnswering - forward + + + ## TFLayoutLMv3Model [[autodoc]] TFLayoutLMv3Model @@ -142,3 +149,6 @@ LayoutLMv3 is nearly identical to LayoutLMv2, so we've also included LayoutLMv2 [[autodoc]] TFLayoutLMv3ForQuestionAnswering - call + + + diff --git a/docs/source/en/model_doc/layoutxlm.md b/docs/source/en/model_doc/layoutxlm.md index 8858560bbb2..f6b2cbef9d6 100644 --- a/docs/source/en/model_doc/layoutxlm.md +++ b/docs/source/en/model_doc/layoutxlm.md @@ -33,6 +33,10 @@ introduce a multilingual form understanding benchmark dataset named XFUN, which for each language. Experiment results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUN dataset.* +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm). + +## Usage tips and examples + One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like so: ```python @@ -56,10 +60,10 @@ Similar to LayoutLMv2, you can use [`LayoutXLMProcessor`] (which internally appl [`LayoutXLMTokenizer`]/[`LayoutXLMTokenizerFast`] in sequence) to prepare all data for the model. + + As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to [LayoutLMv2's documentation page](layoutlmv2) for all tips, code examples and notebooks. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm). - + ## LayoutXLMTokenizer diff --git a/docs/source/en/model_doc/led.md b/docs/source/en/model_doc/led.md index 9ba9383a59d..9a39b0b28ed 100644 --- a/docs/source/en/model_doc/led.md +++ b/docs/source/en/model_doc/led.md @@ -35,7 +35,7 @@ WikiHop and TriviaQA. 
We finally introduce the Longformer-Encoder-Decoder (LED), long document generative sequence-to-sequence tasks, and demonstrate its effectiveness on the arXiv summarization dataset.* -Tips: +## Usage tips - [`LEDForConditionalGeneration`] is an extension of [`BartForConditionalGeneration`] exchanging the traditional *self-attention* layer with @@ -52,15 +52,15 @@ Tips: errors. This can be done by executing `model.gradient_checkpointing_enable()`. Moreover, the `use_cache=False` flag can be used to disable the caching mechanism to save memory. -- A notebook showing how to evaluate LED, can be accessed [here](https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing). -- A notebook showing how to fine-tune LED, can be accessed [here](https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing). - LED is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). -## Documentation resources +## Resources +- [A notebook showing how to evaluate LED](https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing). +- [A notebook showing how to fine-tune LED](https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing). - [Text classification task guide](../tasks/sequence_classification) - [Question answering task guide](../tasks/question_answering) - [Translation task guide](../tasks/translation) @@ -100,6 +100,9 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv [[autodoc]] models.led.modeling_tf_led.TFLEDSeq2SeqLMOutput + + + ## LEDModel [[autodoc]] LEDModel @@ -120,6 +123,9 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv [[autodoc]] LEDForQuestionAnswering - forward + + + ## TFLEDModel [[autodoc]] TFLEDModel @@ -129,3 +135,9 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv [[autodoc]] TFLEDForConditionalGeneration - call + + + + + + diff --git a/docs/source/en/model_doc/levit.md b/docs/source/en/model_doc/levit.md index 8145be775f5..15dc2f4e137 100644 --- a/docs/source/en/model_doc/levit.md +++ b/docs/source/en/model_doc/levit.md @@ -38,7 +38,9 @@ alt="drawing" width="600"/> LeViT Architecture. Taken from the original paper. -Tips: +This model was contributed by [anugunj](https://huggingface.co/anugunj). The original code can be found [here](https://github.com/facebookresearch/LeViT). + +## Usage tips - Compared to ViT, LeViT models use an additional distillation head to effectively learn from a teacher (which, in the LeViT paper, is a ResNet like-model). The distillation head is learned through backpropagation under supervision of a ResNet like-model. They also draw inspiration from convolution neural networks to use activation maps with decreasing resolutions to increase the efficiency. - There are 2 ways to fine-tune distilled models, either (1) in a classic way, by only placing a prediction head on top @@ -63,8 +65,6 @@ Tips: - You can check out demo notebooks regarding inference as well as fine-tuning on custom data [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer) (you can just replace [`ViTFeatureExtractor`] by [`LevitImageProcessor`] and [`ViTForImageClassification`] by [`LevitForImageClassification`] or [`LevitForImageClassificationWithTeacher`]). 
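+
+A minimal image classification sketch with the distillation-aware head is shown below, assuming the [facebook/levit-128S](https://huggingface.co/facebook/levit-128S) checkpoint:
+
+```python
+import requests
+import torch
+from PIL import Image
+
+from transformers import LevitImageProcessor, LevitForImageClassificationWithTeacher
+
+processor = LevitImageProcessor.from_pretrained("facebook/levit-128S")
+model = LevitForImageClassificationWithTeacher.from_pretrained("facebook/levit-128S")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# The model predicts one of the 1000 ImageNet-1k classes.
+predicted_class_idx = logits.argmax(-1).item()
+print(model.config.id2label[predicted_class_idx])
+```
+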
-This model was contributed by [anugunj](https://huggingface.co/anugunj). The original code can be found [here](https://github.com/facebookresearch/LeViT). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LeViT. @@ -90,7 +90,6 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] LevitImageProcessor - preprocess - ## LevitModel [[autodoc]] LevitModel diff --git a/docs/source/en/model_doc/lilt.md b/docs/source/en/model_doc/lilt.md index 901deefd7ff..fb279573fbf 100644 --- a/docs/source/en/model_doc/lilt.md +++ b/docs/source/en/model_doc/lilt.md @@ -26,7 +26,15 @@ The abstract from the paper is the following: *Structured document understanding has attracted considerable attention and made significant progress recently, owing to its crucial role in intelligent document processing. However, most existing related models can only deal with the document data of specific language(s) (typically English) included in the pre-training collection, which is extremely limited. To address this issue, we propose a simple yet effective Language-independent Layout Transformer (LiLT) for structured document understanding. LiLT can be pre-trained on the structured documents of a single language and then directly fine-tuned on other languages with the corresponding off-the-shelf monolingual/multilingual pre-trained textual models. Experimental results on eight languages have shown that LiLT can achieve competitive or even superior performance on diverse widely-used downstream benchmarks, which enables language-independent benefit from the pre-training of document layout structure.* -Tips: + + + LiLT architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/jpwang/lilt). + +## Usage tips - To combine the Language-Independent Layout Transformer with a new RoBERTa checkpoint from the [hub](https://huggingface.co/models?search=roberta), refer to [this guide](https://github.com/jpWang/LiLT#or-generate-your-own-checkpoint-optional). The script will result in `config.json` and `pytorch_model.bin` files being stored locally. After doing this, one can do the following (assuming you're logged in with your HuggingFace account): @@ -42,14 +50,6 @@ model.push_to_hub("name_of_repo_on_the_hub") - As [lilt-roberta-en-base](https://huggingface.co/SCUT-DLVCLab/lilt-roberta-en-base) uses the same vocabulary as [LayoutLMv3](layoutlmv3), one can use [`LayoutLMv3TokenizerFast`] to prepare data for the model. The same is true for [lilt-roberta-en-base](https://huggingface.co/SCUT-DLVCLab/lilt-infoxlm-base): one can use [`LayoutXLMTokenizerFast`] for that model. - - - LiLT architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). -The original code can be found [here](https://github.com/jpwang/lilt). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LiLT. diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md index e63e4b1ab3b..9f55c425d44 100644 --- a/docs/source/en/model_doc/llama.md +++ b/docs/source/en/model_doc/llama.md @@ -24,7 +24,9 @@ The abstract from the paper is the following: *We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. 
We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community. * -Tips: +This model was contributed by [zphang](https://huggingface.co/zphang) with contributions from [BlackSamorez](https://huggingface.co/BlackSamorez). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). + +## Usage tips - Weights for the LLaMA models can be obtained from by filling out [this form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform?usp=send_form) - After downloading the weights, they will need to be converted to the Hugging Face Transformers format using the [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). The script can be called with the following (example) command: @@ -48,9 +50,6 @@ come in several checkpoints they each contain a part of each weight of the model - The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string. -This model was contributed by [zphang](https://huggingface.co/zphang) with contributions from [BlackSamorez](https://huggingface.co/BlackSamorez). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). - - Based on the original LLaMA model, Meta AI has released some follow-up works: - **Llama2**: Llama2 is an improved version of Llama with some architectural tweaks (Grouped Query Attention), and is pre-trained on 2Trillion tokens. Refer to the documentation of Llama2 which can be found [here](llama2). @@ -82,7 +81,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] LlamaConfig - ## LlamaTokenizer [[autodoc]] LlamaTokenizer @@ -105,7 +103,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] LlamaModel - forward - ## LlamaForCausalLM [[autodoc]] LlamaForCausalLM diff --git a/docs/source/en/model_doc/llama2.md b/docs/source/en/model_doc/llama2.md index 0ff1e38f16a..a817a866c0f 100644 --- a/docs/source/en/model_doc/llama2.md +++ b/docs/source/en/model_doc/llama2.md @@ -24,7 +24,10 @@ The abstract from the paper is the following: *In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. 
We provide a detailed description of our approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on our work and contribute to the responsible development of LLMs.* -Checkout all Llama2 models [here](https://huggingface.co/models?search=llama2) +Checkout all Llama2 model checkpoints [here](https://huggingface.co/models?search=llama2). +This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ) with contributions from [Lysandre Debut](https://huggingface.co/lysandre). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). + +## Usage tips @@ -64,7 +67,6 @@ come in several checkpoints they each contain a part of each weight of the model - The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string. -This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ) with contributions from [Lysandre Debut](https://huggingface.co/lysandre). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). ## Resources diff --git a/docs/source/en/model_doc/longformer.md b/docs/source/en/model_doc/longformer.md index 9947195058c..20ba7a92251 100644 --- a/docs/source/en/model_doc/longformer.md +++ b/docs/source/en/model_doc/longformer.md @@ -41,15 +41,15 @@ contrast to most prior work, we also pretrain Longformer and finetune it on a va pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on WikiHop and TriviaQA.* -Tips: +This model was contributed by [beltagy](https://huggingface.co/beltagy). The Authors' code can be found [here](https://github.com/allenai/longformer). + +## Usage tips - Since the Longformer is based on RoBERTa, it doesn't have `token_type_ids`. You don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or ``). - A transformer model replacing the attention matrices by sparse matrices to go faster. Often, the local context (e.g., what are the two tokens left and right?) is enough to take action for a given token. Some preselected input tokens are still given global attention, but the attention matrix has way less parameters, resulting in a speed-up. See the local attention section for more information. -This model was contributed by [beltagy](https://huggingface.co/beltagy). The Authors' code can be found [here](https://github.com/allenai/longformer). - ## Longformer Self Attention Longformer self attention employs self attention on both a "local" context and a "global" context. 
Most tokens only @@ -93,7 +93,7 @@ mlm_labels = tokenizer.encode("This is a sentence from the training data", retur loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0] ``` -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -143,6 +143,9 @@ loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0] [[autodoc]] models.longformer.modeling_tf_longformer.TFLongformerTokenClassifierOutput + + + ## LongformerModel [[autodoc]] LongformerModel @@ -173,6 +176,9 @@ loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0] [[autodoc]] LongformerForQuestionAnswering - forward + + + ## TFLongformerModel [[autodoc]] TFLongformerModel @@ -202,3 +208,6 @@ loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0] [[autodoc]] TFLongformerForMultipleChoice - call + + + diff --git a/docs/source/en/model_doc/longt5.md b/docs/source/en/model_doc/longt5.md index e8dcfe3b237..40faa6d8c23 100644 --- a/docs/source/en/model_doc/longt5.md +++ b/docs/source/en/model_doc/longt5.md @@ -36,7 +36,10 @@ attention ideas from long-input transformers (ETC), and adopted pre-training str able to achieve state-of-the-art results on several summarization tasks and outperform the original T5 models on question answering tasks.* -Tips: +This model was contributed by [stancld](https://huggingface.co/stancld). +The original code can be found [here](https://github.com/google-research/longt5). + +## Usage tips - [`LongT5ForConditionalGeneration`] is an extension of [`T5ForConditionalGeneration`] exchanging the traditional encoder *self-attention* layer with efficient either *local* attention or *transient-global* (*tglobal*) attention. @@ -87,10 +90,8 @@ The complexity of this mechanism is `O(l(r + l/k))`. >>> rouge.compute(predictions=result["predicted_abstract"], references=result["abstract"]) ``` -This model was contributed by [stancld](https://huggingface.co/stancld). -The original code can be found [here](https://github.com/google-research/longt5). -## Documentation resources +## Resources - [Translation task guide](../tasks/translation) - [Summarization task guide](../tasks/summarization) @@ -99,6 +100,9 @@ The original code can be found [here](https://github.com/google-research/longt5) [[autodoc]] LongT5Config + + + ## LongT5Model [[autodoc]] LongT5Model @@ -114,6 +118,9 @@ The original code can be found [here](https://github.com/google-research/longt5) [[autodoc]] LongT5EncoderModel - forward + + + ## FlaxLongT5Model [[autodoc]] FlaxLongT5Model @@ -127,3 +134,6 @@ The original code can be found [here](https://github.com/google-research/longt5) - __call__ - encode - decode + + + diff --git a/docs/source/en/model_doc/luke.md b/docs/source/en/model_doc/luke.md index 2947c7c41bd..4e070b1c4ba 100644 --- a/docs/source/en/model_doc/luke.md +++ b/docs/source/en/model_doc/luke.md @@ -37,7 +37,9 @@ state-of-the-art results on five well-known datasets: Open Entity (entity typing CoNLL-2003 (named entity recognition), ReCoRD (cloze-style question answering), and SQuAD 1.1 (extractive question answering).* -Tips: +This model was contributed by [ikuyamada](https://huggingface.co/ikuyamada) and [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/studio-ousia/luke). 
+ +## Usage tips - This implementation is the same as [`RobertaModel`] with the addition of entity embeddings as well as an entity-aware self-attention mechanism, which improves performance on tasks involving reasoning about entities. @@ -75,13 +77,7 @@ Tips: head models by specifying `task="entity_classification"`, `task="entity_pair_classification"`, or `task="entity_span_classification"`. Please refer to the example code of each head models. - A demo notebook on how to fine-tune [`LukeForEntityPairClassification`] for relation - classification can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/LUKE). - - There are also 3 notebooks available, which showcase how you can reproduce the results as reported in the paper with - the HuggingFace implementation of LUKE. They can be found [here](https://github.com/studio-ousia/luke/tree/master/notebooks). - -Example: +Usage example: ```python >>> from transformers import LukeTokenizer, LukeModel, LukeForEntityPairClassification @@ -119,10 +115,10 @@ Example: >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) ``` -This model was contributed by [ikuyamada](https://huggingface.co/ikuyamada) and [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/studio-ousia/luke). - -## Documentation resources +## Resources +- [A demo notebook on how to fine-tune [`LukeForEntityPairClassification`] for relation classification](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/LUKE) +- [Notebooks showcasing how you to reproduce the results as reported in the paper with the HuggingFace implementation of LUKE](https://github.com/studio-ousia/luke/tree/master/notebooks) - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) - [Question answering task guide](../tasks/question_answering) diff --git a/docs/source/en/model_doc/lxmert.md b/docs/source/en/model_doc/lxmert.md index 114539f61e8..435994196b4 100644 --- a/docs/source/en/model_doc/lxmert.md +++ b/docs/source/en/model_doc/lxmert.md @@ -41,7 +41,9 @@ best result by 22% absolute (54% to 76%). Lastly, we demonstrate detailed ablati model components and pretraining strategies significantly contribute to our strong results; and also present several attention visualizations for the different encoders* -Tips: +This model was contributed by [eltoto1219](https://huggingface.co/eltoto1219). The original code can be found [here](https://github.com/airsplay/lxmert). + +## Usage tips - Bounding boxes are not necessary to be used in the visual feature embeddings, any kind of visual-spacial features will work. @@ -53,9 +55,7 @@ Tips: contains self-attention for each respective modality and cross-attention, only the cross attention is returned and both self attention outputs are disregarded. -This model was contributed by [eltoto1219](https://huggingface.co/eltoto1219). The original code can be found [here](https://github.com/airsplay/lxmert). - -## Documentation resources +## Resources - [Question answering task guide](../tasks/question_answering) @@ -83,6 +83,9 @@ This model was contributed by [eltoto1219](https://huggingface.co/eltoto1219). T [[autodoc]] models.lxmert.modeling_tf_lxmert.TFLxmertForPreTrainingOutput + + + ## LxmertModel [[autodoc]] LxmertModel @@ -98,6 +101,9 @@ This model was contributed by [eltoto1219](https://huggingface.co/eltoto1219). 
T [[autodoc]] LxmertForQuestionAnswering - forward + + + ## TFLxmertModel [[autodoc]] TFLxmertModel @@ -107,3 +113,6 @@ This model was contributed by [eltoto1219](https://huggingface.co/eltoto1219). T [[autodoc]] TFLxmertForPreTraining - call + + + diff --git a/docs/source/en/model_doc/m2m_100.md b/docs/source/en/model_doc/m2m_100.md index c2b4354c6d5..fa808c2e94b 100644 --- a/docs/source/en/model_doc/m2m_100.md +++ b/docs/source/en/model_doc/m2m_100.md @@ -38,7 +38,7 @@ open-source our scripts so that others may reproduce the data, evaluation, and f This model was contributed by [valhalla](https://huggingface.co/valhalla). -### Training and Generation +## Usage tips and examples M2M100 is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation tasks. As the model is multilingual it expects the sequences in a certain format: A special language id token is used as prefix in both the @@ -48,7 +48,7 @@ id for source text and target language id for target text, with `X` being the so The [`M2M100Tokenizer`] depends on `sentencepiece` so be sure to install it before running the examples. To install `sentencepiece` run `pip install sentencepiece`. -- Supervised Training +**Supervised Training** ```python from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer @@ -64,12 +64,12 @@ model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") loss = model(**model_inputs).loss # forward pass ``` -- Generation +**Generation** - M2M100 uses the `eos_token_id` as the `decoder_start_token_id` for generation with the target language id - being forced as the first generated token. To force the target language id as the first generated token, pass the - *forced_bos_token_id* parameter to the *generate* method. The following example shows how to translate between - Hindi to French and Chinese to English using the *facebook/m2m100_418M* checkpoint. +M2M100 uses the `eos_token_id` as the `decoder_start_token_id` for generation with the target language id +being forced as the first generated token. To force the target language id as the first generated token, pass the +*forced_bos_token_id* parameter to the *generate* method. The following example shows how to translate between +Hindi to French and Chinese to English using the *facebook/m2m100_418M* checkpoint. ```python >>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer @@ -95,7 +95,7 @@ loss = model(**model_inputs).loss # forward pass "Life is like a box of chocolate." ``` -## Documentation resources +## Resources - [Translation task guide](../tasks/translation) - [Summarization task guide](../tasks/summarization) diff --git a/docs/source/en/model_doc/marian.md b/docs/source/en/model_doc/marian.md index 8be41686594..8078ea1427c 100644 --- a/docs/source/en/model_doc/marian.md +++ b/docs/source/en/model_doc/marian.md @@ -25,14 +25,11 @@ rendered properly in your Markdown viewer. -**Bugs:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title) -and assign @patrickvonplaten. +## Overview -Translations should be similar, but not identical to output in the test set linked to in each model card. +A framework for translation models, using the same models as BART. Translations should be similar, but not identical to output in the test set linked to in each model card. +This model was contributed by [sshleifer](https://huggingface.co/sshleifer). 
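+
+A minimal translation sketch, assuming the [Helsinki-NLP/opus-mt-en-de](https://huggingface.co/Helsinki-NLP/opus-mt-en-de) checkpoint (other Marian checkpoints follow the same pattern):
+
+```python
+from transformers import MarianMTModel, MarianTokenizer
+
+model_name = "Helsinki-NLP/opus-mt-en-de"
+tokenizer = MarianTokenizer.from_pretrained(model_name)
+model = MarianMTModel.from_pretrained(model_name)
+
+# Tokenize, generate the translation and decode it back to text.
+batch = tokenizer(["Machine translation is really fun!"], return_tensors="pt")
+generated_ids = model.generate(**batch)
+print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
+```
+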
-Tips: - -- A framework for translation models, using the same models as BART. ## Implementation Notes @@ -49,7 +46,7 @@ Tips: - the model starts generating with `pad_token_id` (which has 0 as a token_embedding) as the prefix (Bart uses ``), - Code to bulk convert models can be found in `convert_marian_to_pytorch.py`. -- This model was contributed by [sshleifer](https://huggingface.co/sshleifer). + ## Naming @@ -165,7 +162,7 @@ Example of translating english to many romance languages, using old-style 2 char 'Y esto al espaΓ±ol'] ``` -## Documentation resources +## Resources - [Translation task guide](../tasks/translation) - [Summarization task guide](../tasks/summarization) @@ -180,6 +177,9 @@ Example of translating english to many romance languages, using old-style 2 char [[autodoc]] MarianTokenizer - build_inputs_with_special_tokens + + + ## MarianModel [[autodoc]] MarianModel @@ -195,6 +195,9 @@ Example of translating english to many romance languages, using old-style 2 char [[autodoc]] MarianForCausalLM - forward + + + ## TFMarianModel [[autodoc]] TFMarianModel @@ -205,6 +208,9 @@ Example of translating english to many romance languages, using old-style 2 char [[autodoc]] TFMarianMTModel - call + + + ## FlaxMarianModel [[autodoc]] FlaxMarianModel @@ -214,3 +220,6 @@ Example of translating english to many romance languages, using old-style 2 char [[autodoc]] FlaxMarianMTModel - __call__ + + + diff --git a/docs/source/en/model_doc/markuplm.md b/docs/source/en/model_doc/markuplm.md index b286c4fc00c..8150892e63f 100644 --- a/docs/source/en/model_doc/markuplm.md +++ b/docs/source/en/model_doc/markuplm.md @@ -40,19 +40,19 @@ HTML/XML-based documents, where text and markup information is jointly pre-train pre-trained MarkupLM significantly outperforms the existing strong baseline models on several document understanding tasks. The pre-trained model and code will be publicly available.* -Tips: +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/markuplm). + +## Usage tips + - In addition to `input_ids`, [`~MarkupLMModel.forward`] expects 2 additional inputs, namely `xpath_tags_seq` and `xpath_subs_seq`. These are the XPATH tags and subscripts respectively for each token in the input sequence. - One can use [`MarkupLMProcessor`] to prepare all data for the model. Refer to the [usage guide](#usage-markuplmprocessor) for more info. -- Demo notebooks can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/MarkupLM). drawing MarkupLM architecture. Taken from the original paper. -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/markuplm). - ## Usage: MarkupLMProcessor The easiest way to prepare data for the model is to use [`MarkupLMProcessor`], which internally combines a feature extractor @@ -197,8 +197,9 @@ all nodes and xpaths yourself, you can provide them directly to the processor. 
M dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'xpath_tags_seq', 'xpath_subs_seq']) ``` -## Documentation resources +## Resources +- [Demo notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/MarkupLM) - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) - [Question answering task guide](../tasks/question_answering) diff --git a/docs/source/en/model_doc/mask2former.md b/docs/source/en/model_doc/mask2former.md index ddfa5da2ba2..bd5ab80728e 100644 --- a/docs/source/en/model_doc/mask2former.md +++ b/docs/source/en/model_doc/mask2former.md @@ -25,16 +25,17 @@ The abstract from the paper is the following: *Image segmentation groups pixels with different semantics, e.g., category or instance membership. Each choice of semantics defines a task. While only the semantics of each task differ, current research focuses on designing specialized architectures for each task. We present Masked-attention Mask Transformer (Mask2Former), a new architecture capable of addressing any image segmentation task (panoptic, instance or semantic). Its key components include masked attention, which extracts localized features by constraining cross-attention within predicted mask regions. In addition to reducing the research effort by at least three times, it outperforms the best specialized architectures by a significant margin on four popular datasets. Most notably, Mask2Former sets a new state-of-the-art for panoptic segmentation (57.8 PQ on COCO), instance segmentation (50.1 AP on COCO) and semantic segmentation (57.7 mIoU on ADE20K).* -Tips: -- Mask2Former uses the same preprocessing and postprocessing steps as [MaskFormer](maskformer). Use [`Mask2FormerImageProcessor`] or [`AutoImageProcessor`] to prepare images and optional targets for the model. -- To get the final segmentation, depending on the task, you can call [`~Mask2FormerImageProcessor.post_process_semantic_segmentation`] or [`~Mask2FormerImageProcessor.post_process_instance_segmentation`] or [`~Mask2FormerImageProcessor.post_process_panoptic_segmentation`]. All three tasks can be solved using [`Mask2FormerForUniversalSegmentation`] output, panoptic segmentation accepts an optional `label_ids_to_fuse` argument to fuse instances of the target object/s (e.g. sky) together. - drawing Mask2Former architecture. Taken from the original paper. This model was contributed by [Shivalika Singh](https://huggingface.co/shivi) and [Alara Dirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/facebookresearch/Mask2Former). +## Usage tips + +- Mask2Former uses the same preprocessing and postprocessing steps as [MaskFormer](maskformer). Use [`Mask2FormerImageProcessor`] or [`AutoImageProcessor`] to prepare images and optional targets for the model. +- To get the final segmentation, depending on the task, you can call [`~Mask2FormerImageProcessor.post_process_semantic_segmentation`] or [`~Mask2FormerImageProcessor.post_process_instance_segmentation`] or [`~Mask2FormerImageProcessor.post_process_panoptic_segmentation`]. All three tasks can be solved using [`Mask2FormerForUniversalSegmentation`] output, panoptic segmentation accepts an optional `label_ids_to_fuse` argument to fuse instances of the target object/s (e.g. sky) together. + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Mask2Former. 
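+
+The post-processing flow described in the usage tips above can be sketched as follows. The checkpoint name is only an example; any Mask2Former panoptic checkpoint from the Hub should work the same way:
+
+```python
+import requests
+import torch
+from PIL import Image
+
+from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation
+
+checkpoint = "facebook/mask2former-swin-large-coco-panoptic"
+processor = AutoImageProcessor.from_pretrained(checkpoint)
+model = Mask2FormerForUniversalSegmentation.from_pretrained(checkpoint)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# Each entry holds a "segmentation" map plus per-segment metadata in "segments_info".
+prediction = processor.post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
+print(prediction["segmentation"].shape, len(prediction["segments_info"]))
+```
+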
@@ -44,16 +45,16 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it. The resource should ideally demonstrate something new instead of duplicating an existing resource. +## Mask2FormerConfig + +[[autodoc]] Mask2FormerConfig + ## MaskFormer specific outputs [[autodoc]] models.mask2former.modeling_mask2former.Mask2FormerModelOutput [[autodoc]] models.mask2former.modeling_mask2former.Mask2FormerForUniversalSegmentationOutput -## Mask2FormerConfig - -[[autodoc]] Mask2FormerConfig - ## Mask2FormerModel [[autodoc]] Mask2FormerModel diff --git a/docs/source/en/model_doc/maskformer.md b/docs/source/en/model_doc/maskformer.md index 4695e54857f..5566dec5859 100644 --- a/docs/source/en/model_doc/maskformer.md +++ b/docs/source/en/model_doc/maskformer.md @@ -31,7 +31,14 @@ The abstract from the paper is the following: *Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models.* -Tips: +The figure below illustrates the architecture of MaskFormer. Taken from the [original paper](https://arxiv.org/abs/2107.06278). + + + +This model was contributed by [francesco](https://huggingface.co/francesco). The original code can be found [here](https://github.com/facebookresearch/MaskFormer). + +## Usage tips + - MaskFormer's Transformer decoder is identical to the decoder of [DETR](detr). During training, the authors of DETR did find it helpful to use auxiliary losses in the decoder, especially to help the model output the correct number of objects of each class. If you set the parameter `use_auxilary_loss` of [`MaskFormerConfig`] to `True`, then prediction feedforward neural networks and Hungarian losses are added after each decoder layer (with the FFNs sharing parameters). - If you want to train the model in a distributed environment across multiple nodes, then one should update the `get_num_masks` function inside in the `MaskFormerLoss` class of `modeling_maskformer.py`. When training on multiple nodes, this should be @@ -39,12 +46,6 @@ Tips: - One can use [`MaskFormerImageProcessor`] to prepare images for the model and optional targets for the model. - To get the final segmentation, depending on the task, you can call [`~MaskFormerImageProcessor.post_process_semantic_segmentation`] or [`~MaskFormerImageProcessor.post_process_panoptic_segmentation`]. 
Both tasks can be solved using [`MaskFormerForInstanceSegmentation`] output, panoptic segmentation accepts an optional `label_ids_to_fuse` argument to fuse instances of the target object/s (e.g. sky) together. -The figure below illustrates the architecture of MaskFormer. Taken from the [original paper](https://arxiv.org/abs/2107.06278). - - - -This model was contributed by [francesco](https://huggingface.co/francesco). The original code can be found [here](https://github.com/facebookresearch/MaskFormer). - ## Resources diff --git a/docs/source/en/model_doc/matcha.md b/docs/source/en/model_doc/matcha.md index 20c403413fe..d4ee3305936 100644 --- a/docs/source/en/model_doc/matcha.md +++ b/docs/source/en/model_doc/matcha.md @@ -67,4 +67,10 @@ from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup optimizer = Adafactor(self.parameters(), scale_parameter=False, relative_step=False, lr=0.01, weight_decay=1e-05) scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=40000) -``` \ No newline at end of file +``` + + + +MatCha is a model that is trained using `Pix2Struct` architecture. You can find more information about `Pix2Struct` in the [Pix2Struct documentation](https://huggingface.co/docs/transformers/main/en/model_doc/pix2struct). + + \ No newline at end of file diff --git a/docs/source/en/model_doc/mbart.md b/docs/source/en/model_doc/mbart.md index 8a614dd5055..e7fc0bd53ef 100644 --- a/docs/source/en/model_doc/mbart.md +++ b/docs/source/en/model_doc/mbart.md @@ -25,8 +25,6 @@ rendered properly in your Markdown viewer. -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign -@patrickvonplaten ## Overview of MBart @@ -186,6 +184,9 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) [[autodoc]] MBart50TokenizerFast + + + ## MBartModel [[autodoc]] MBartModel @@ -207,6 +208,9 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) [[autodoc]] MBartForCausalLM - forward + + + ## TFMBartModel [[autodoc]] TFMBartModel @@ -217,6 +221,9 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) [[autodoc]] TFMBartForConditionalGeneration - call + + + ## FlaxMBartModel [[autodoc]] FlaxMBartModel @@ -244,3 +251,6 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) - __call__ - encode - decode + + + diff --git a/docs/source/en/model_doc/mctct.md b/docs/source/en/model_doc/mctct.md index 72d4bedfac6..7cf1a68f12e 100644 --- a/docs/source/en/model_doc/mctct.md +++ b/docs/source/en/model_doc/mctct.md @@ -40,18 +40,16 @@ pseudo-labels for all languages, either from scratch or by fine-tuning. Experime Common Voice and unlabeled VoxPopuli datasets show that our recipe can yield a model with better performance for many languages that also transfers well to LibriSpeech.* - - This model was contributed by [cwkeam](https://huggingface.co/cwkeam). The original code can be found [here](https://github.com/flashlight/wav2letter/tree/main/recipes/mling_pl). -## Documentation resources +## Usage tips + +The PyTorch version of this model is only available in torch 1.9 and higher. + +## Resources - [Automatic speech recognition task guide](../tasks/asr) -Tips: - -- The PyTorch version of this model is only available in torch 1.9 and higher. 
- ## MCTCTConfig [[autodoc]] MCTCTConfig @@ -70,7 +68,6 @@ Tips: - batch_decode - decode - ## MCTCTModel [[autodoc]] MCTCTModel diff --git a/docs/source/en/model_doc/mega.md b/docs/source/en/model_doc/mega.md index d4d68b9becd..4ce62ca45a1 100644 --- a/docs/source/en/model_doc/mega.md +++ b/docs/source/en/model_doc/mega.md @@ -28,15 +28,17 @@ The abstract from the paper is the following: *The design choices in the Transformer attention mechanism, including weak inductive bias and quadratic computational complexity, have limited its application for modeling long sequences. In this paper, we introduce Mega, a simple, theoretically grounded, single-head gated attention mechanism equipped with (exponential) moving average to incorporate inductive bias of position-aware local dependencies into the position-agnostic attention mechanism. We further propose a variant of Mega that offers linear time and space complexity yet yields only minimal quality loss, by efficiently splitting the whole sequence into multiple chunks with fixed length. Extensive experiments on a wide range of sequence modeling benchmarks, including the Long Range Arena, neural machine translation, auto-regressive language modeling, and image and speech classification, show that Mega achieves significant improvements over other sequence models, including variants of Transformers and recent state space models. * -Tips: +This model was contributed by [mnaylor](https://huggingface.co/mnaylor). +The original code can be found [here](https://github.com/facebookresearch/mega). + + +## Usage tips - MEGA can perform quite well with relatively few parameters. See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. If using MEGA as a decoder, be sure to set `bidirectional=False` to avoid errors with default bidirectional. - Mega-chunk is a variant of mega that reduces time and spaces complexity from quadratic to linear. Utilize chunking with MegaConfig.use_chunking and control chunk size with MegaConfig.chunk_size -This model was contributed by [mnaylor](https://huggingface.co/mnaylor). -The original code can be found [here](https://github.com/facebookresearch/mega). -Implementation Notes: +## Implementation Notes - The original implementation of MEGA had an inconsistent expectation of attention masks for padding and causal self-attention between the softmax attention and Laplace/squared ReLU method. This implementation addresses that inconsistency. - The original implementation did not include token type embeddings; this implementation adds support for these, with the option controlled by MegaConfig.add_token_type_embeddings diff --git a/docs/source/en/model_doc/megatron-bert.md b/docs/source/en/model_doc/megatron-bert.md index 88ccff23587..67000c8b843 100644 --- a/docs/source/en/model_doc/megatron-bert.md +++ b/docs/source/en/model_doc/megatron-bert.md @@ -40,7 +40,11 @@ achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15. accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy of 89.4%).* -Tips: +This model was contributed by [jdemouth](https://huggingface.co/jdemouth). The original code can be found [here](https://github.com/NVIDIA/Megatron-LM). +That repository contains a multi-GPU and multi-node implementation of the Megatron Language models. In particular, +it contains a hybrid model parallel approach using "tensor parallel" and "pipeline parallel" techniques. 
+ +## Usage tips We have provided pretrained [BERT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m) checkpoints for use to evaluate or finetuning downstream tasks. @@ -78,11 +82,7 @@ python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpo python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip ``` -This model was contributed by [jdemouth](https://huggingface.co/jdemouth). The original code can be found [here](https://github.com/NVIDIA/Megatron-LM). That repository contains a multi-GPU and multi-node implementation of the -Megatron Language models. In particular, it contains a hybrid model parallel approach using "tensor parallel" and -"pipeline parallel" techniques. - -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) diff --git a/docs/source/en/model_doc/megatron_gpt2.md b/docs/source/en/model_doc/megatron_gpt2.md index 1eea7d82bf3..284fd372c0e 100644 --- a/docs/source/en/model_doc/megatron_gpt2.md +++ b/docs/source/en/model_doc/megatron_gpt2.md @@ -40,7 +40,11 @@ achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15. accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy of 89.4%).* -Tips: +This model was contributed by [jdemouth](https://huggingface.co/jdemouth). The original code can be found [here](https://github.com/NVIDIA/Megatron-LM). +That repository contains a multi-GPU and multi-node implementation of the Megatron Language models. In particular, it +contains a hybrid model parallel approach using "tensor parallel" and "pipeline parallel" techniques. + +## Usage tips We have provided pretrained [GPT2-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m) checkpoints for use to evaluate or finetuning downstream tasks. @@ -65,7 +69,9 @@ The following command allows you to do the conversion. We assume that the folder python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip ``` -This model was contributed by [jdemouth](https://huggingface.co/jdemouth). The original code can be found [here](https://github.com/NVIDIA/Megatron-LM). That repository contains a multi-GPU and multi-node implementation of the -Megatron Language models. In particular, it contains a hybrid model parallel approach using "tensor parallel" and -"pipeline parallel" techniques. + + MegatronGPT2 architecture is the same as OpenAI GPT-2 . Refer to [GPT-2 documentation](gpt2) for information on + configuration classes and their parameters. + + \ No newline at end of file diff --git a/docs/source/en/model_doc/mgp-str.md b/docs/source/en/model_doc/mgp-str.md index e384c062017..5a44a18b349 100644 --- a/docs/source/en/model_doc/mgp-str.md +++ b/docs/source/en/model_doc/mgp-str.md @@ -29,12 +29,10 @@ alt="drawing" width="600"/> MGP-STR architecture. Taken from the original paper. -Tips: +MGP-STR is trained on two synthetic datasets [MJSynth]((http://www.robots.ox.ac.uk/~vgg/data/text/)) (MJ) and SynthText(http://www.robots.ox.ac.uk/~vgg/data/scenetext/) (ST) without fine-tuning on other datasets. It achieves state-of-the-art results on six standard Latin scene text benchmarks, including 3 regular text datasets (IC13, SVT, IIIT) and 3 irregular ones (IC15, SVTP, CUTE). 
+This model was contributed by [yuekun](https://huggingface.co/yuekun). The original code can be found [here](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/OCR/MGP-STR). -- MGP-STR is trained on two synthetic datasets [MJSynth]((http://www.robots.ox.ac.uk/~vgg/data/text/)) (MJ) and SynthText(http://www.robots.ox.ac.uk/~vgg/data/scenetext/) (ST) without fine-tuning on other datasets. It achieves state-of-the-art results on six standard Latin scene text benchmarks, including 3 regular text datasets (IC13, SVT, IIIT) and 3 irregular ones (IC15, SVTP, CUTE). -- This model was contributed by [yuekun](https://huggingface.co/yuekun). The original code can be found [here](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/OCR/MGP-STR). - -## Inference +## Inference example [`MgpstrModel`] accepts images as input and generates three types of predictions, which represent textual information at different granularities. The three types of predictions are fused to give the final prediction result. @@ -46,7 +44,7 @@ into a single instance to both extract the input features and decode the predict - Step-by-step Optical Character Recognition (OCR) -``` py +```py >>> from transformers import MgpstrProcessor, MgpstrForSceneTextRecognition >>> import requests >>> from PIL import Image diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index 5972f72a614..8e37bc2caf8 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -18,9 +18,9 @@ rendered properly in your Markdown viewer. ## Overview -Mistral-7B-v0.1 is Mistral AI’s first Large Language Model (LLM). +Mistral-7B-v0.1 is Mistral AI's first Large Language Model (LLM). -## Model Details +### Model Details Mistral-7B-v0.1 is a decoder-based LM with the following architectural choices: * Sliding Window Attention - Trained with 8k context length and fixed cache size, with a theoretical attention span of 128K tokens @@ -31,11 +31,11 @@ We also provide an instruction fine-tuned model: `Mistral-7B-Instruct-v0.1` whic For more details please read our [release blog post](https://mistral.ai/news/announcing-mistral-7b/) -## License +### License Both `Mistral-7B-v0.1` and `Mistral-7B-Instruct-v0.1` are released under the Apache 2.0 license. -## Usage +## Usage tips `Mistral-7B-v0.1` and `Mistral-7B-Instruct-v0.1` can be found on the [Huggingface Hub](https://huggingface.co/mistralai) diff --git a/docs/source/en/model_doc/mluke.md b/docs/source/en/model_doc/mluke.md index ec9430848ce..719af76ad44 100644 --- a/docs/source/en/model_doc/mluke.md +++ b/docs/source/en/model_doc/mluke.md @@ -37,6 +37,10 @@ representations into the input allows us to extract more language-agnostic featu multilingual cloze prompt task with the mLAMA dataset. We show that entity-based prompt elicits correct factual knowledge more likely than using only word representations.* +This model was contributed by [ryo0634](https://huggingface.co/ryo0634). The original code can be found [here](https://github.com/studio-ousia/luke). + +## Usage tips + One can directly plug in the weights of mLUKE into a LUKE model, like so: ```python @@ -53,10 +57,12 @@ from transformers import MLukeTokenizer tokenizer = MLukeTokenizer.from_pretrained("studio-ousia/mluke-base") ``` + + As mLUKE's architecture is equivalent to that of LUKE, one can refer to [LUKE's documentation page](luke) for all tips, code examples and notebooks. -This model was contributed by [ryo0634](https://huggingface.co/ryo0634). 
The original code can be found [here](https://github.com/studio-ousia/luke). + ## MLukeTokenizer diff --git a/docs/source/en/model_doc/mms.md b/docs/source/en/model_doc/mms.md index 497eb40d7e3..aefdbfd889f 100644 --- a/docs/source/en/model_doc/mms.md +++ b/docs/source/en/model_doc/mms.md @@ -306,7 +306,6 @@ with torch.no_grad(): outputs = model(**inputs) ``` - ### Language Identification (LID) Different LID models are available based on the number of languages they can recognize - [126](https://huggingface.co/facebook/mms-lid-126), [256](https://huggingface.co/facebook/mms-lid-256), [512](https://huggingface.co/facebook/mms-lid-512), [1024](https://huggingface.co/facebook/mms-lid-1024), [2048](https://huggingface.co/facebook/mms-lid-2048), [4017](https://huggingface.co/facebook/mms-lid-4017). @@ -378,4 +377,13 @@ processor.id2label.values() ### Audio Pretrained Models -Pretrained models are available for two different sizes - [300M](https://huggingface.co/facebook/mms-300m) , [1Bil](https://huggingface.co/facebook/mms-1b). The architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2) for further details on how to finetune with models for various downstream tasks. +Pretrained models are available for two different sizes - [300M](https://huggingface.co/facebook/mms-300m) , +[1Bil](https://huggingface.co/facebook/mms-1b). + + + +The MMS for ASR architecture is based on the Wav2Vec2 model, refer to [Wav2Vec2's documentation page](wav2vec2) for further +details on how to finetune with models for various downstream tasks. + +MMS-TTS uses the same model architecture as VITS, refer to [VITS's documentation page](vits) for API reference. + diff --git a/docs/source/en/model_doc/mobilebert.md b/docs/source/en/model_doc/mobilebert.md index e652756351d..fbd9d34afb9 100644 --- a/docs/source/en/model_doc/mobilebert.md +++ b/docs/source/en/model_doc/mobilebert.md @@ -37,7 +37,9 @@ natural language inference tasks of GLUE, MobileBERT achieves a GLUEscore o 77.7 latency on a Pixel 4 phone. On the SQuAD v1.1/v2.0 question answering task, MobileBERT achieves a dev F1 score of 90.0/79.2 (1.5/2.1 higher than BERT_BASE).* -Tips: +This model was contributed by [vshampor](https://huggingface.co/vshampor). The original code can be found [here](https://github.com/google-research/mobilebert). + +## Usage tips - MobileBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. @@ -45,9 +47,8 @@ Tips: efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language modeling (CLM) objective are better in that regard. -This model was contributed by [vshampor](https://huggingface.co/vshampor). The original code can be found [here](https://github.com/google-research/mobilebert). -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -73,6 +74,9 @@ This model was contributed by [vshampor](https://huggingface.co/vshampor). The o [[autodoc]] models.mobilebert.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput + + + ## MobileBertModel [[autodoc]] MobileBertModel @@ -113,6 +117,9 @@ This model was contributed by [vshampor](https://huggingface.co/vshampor). 
The o [[autodoc]] MobileBertForQuestionAnswering - forward + + + ## TFMobileBertModel [[autodoc]] TFMobileBertModel @@ -152,3 +159,6 @@ This model was contributed by [vshampor](https://huggingface.co/vshampor). The o [[autodoc]] TFMobileBertForQuestionAnswering - call + + + diff --git a/docs/source/en/model_doc/mobilenet_v1.md b/docs/source/en/model_doc/mobilenet_v1.md index 56743efe141..9f68035c63c 100644 --- a/docs/source/en/model_doc/mobilenet_v1.md +++ b/docs/source/en/model_doc/mobilenet_v1.md @@ -24,7 +24,9 @@ The abstract from the paper is the following: *We present a class of efficient models called MobileNets for mobile and embedded vision applications. MobileNets are based on a streamlined architecture that uses depth-wise separable convolutions to build light weight deep neural networks. We introduce two simple global hyper-parameters that efficiently trade off between latency and accuracy. These hyper-parameters allow the model builder to choose the right sized model for their application based on the constraints of the problem. We present extensive experiments on resource and accuracy tradeoffs and show strong performance compared to other popular models on ImageNet classification. We then demonstrate the effectiveness of MobileNets across a wide range of applications and use cases including object detection, finegrain classification, face attributes and large scale geo-localization.* -Tips: +This model was contributed by [matthijs](https://huggingface.co/Matthijs). The original code and weights can be found [here](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md). + +## Usage tips - The checkpoints are named **mobilenet\_v1\_*depth*\_*size***, for example **mobilenet\_v1\_1.0\_224**, where **1.0** is the depth multiplier (sometimes also referred to as "alpha" or the width multiplier) and **224** is the resolution of the input images the model was trained on. @@ -46,8 +48,6 @@ Unsupported features: - It's common to extract the output from the pointwise layers at indices 5, 11, 12, 13 for downstream purposes. Using `output_hidden_states=True` returns the output from all intermediate layers. There is currently no way to limit this to specific layers. -This model was contributed by [matthijs](https://huggingface.co/Matthijs). The original code and weights can be found [here](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with MobileNetV1. diff --git a/docs/source/en/model_doc/mobilenet_v2.md b/docs/source/en/model_doc/mobilenet_v2.md index bd4114dc71a..ff22231ae0c 100644 --- a/docs/source/en/model_doc/mobilenet_v2.md +++ b/docs/source/en/model_doc/mobilenet_v2.md @@ -26,7 +26,9 @@ The abstract from the paper is the following: *The MobileNetV2 architecture is based on an inverted residual structure where the input and output of the residual block are thin bottleneck layers opposite to traditional residual models which use expanded representations in the input an MobileNetV2 uses lightweight depthwise convolutions to filter features in the intermediate expansion layer. Additionally, we find that it is important to remove non-linearities in the narrow layers in order to maintain representational power. We demonstrate that this improves performance and provide an intuition that led to this design. 
Finally, our approach allows decoupling of the input/output domains from the expressiveness of the transformation, which provides a convenient framework for further analysis. We measure our performance on Imagenet classification, COCO object detection, VOC image segmentation. We evaluate the trade-offs between accuracy, and number of operations measured by multiply-adds (MAdd), as well as the number of parameters.* -Tips: +This model was contributed by [matthijs](https://huggingface.co/Matthijs). The original code and weights can be found [here for the main model](https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet) and [here for DeepLabV3+](https://github.com/tensorflow/models/tree/master/research/deeplab). + +## Usage tips - The checkpoints are named **mobilenet\_v2\_*depth*\_*size***, for example **mobilenet\_v2\_1.0\_224**, where **1.0** is the depth multiplier (sometimes also referred to as "alpha" or the width multiplier) and **224** is the resolution of the input images the model was trained on. @@ -50,8 +52,6 @@ Unsupported features: - The DeepLabV3+ segmentation head does not use the final convolution layer from the backbone, but this layer gets computed anyway. There is currently no way to tell [`MobileNetV2Model`] up to which layer it should run. -This model was contributed by [matthijs](https://huggingface.co/Matthijs). The original code and weights can be found [here for the main model](https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet) and [here for DeepLabV3+](https://github.com/tensorflow/models/tree/master/research/deeplab). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with MobileNetV2. diff --git a/docs/source/en/model_doc/mobilevit.md b/docs/source/en/model_doc/mobilevit.md index 2d815795689..e724ffa380e 100644 --- a/docs/source/en/model_doc/mobilevit.md +++ b/docs/source/en/model_doc/mobilevit.md @@ -24,7 +24,9 @@ The abstract from the paper is the following: *Light-weight convolutional neural networks (CNNs) are the de-facto for mobile vision tasks. Their spatial inductive biases allow them to learn representations with fewer parameters across different vision tasks. However, these networks are spatially local. To learn global representations, self-attention-based vision trans-formers (ViTs) have been adopted. Unlike CNNs, ViTs are heavy-weight. In this paper, we ask the following question: is it possible to combine the strengths of CNNs and ViTs to build a light-weight and low latency network for mobile vision tasks? Towards this end, we introduce MobileViT, a light-weight and general-purpose vision transformer for mobile devices. MobileViT presents a different perspective for the global processing of information with transformers, i.e., transformers as convolutions. Our results show that MobileViT significantly outperforms CNN- and ViT-based networks across different tasks and datasets. On the ImageNet-1k dataset, MobileViT achieves top-1 accuracy of 78.4% with about 6 million parameters, which is 3.2% and 6.2% more accurate than MobileNetv3 (CNN-based) and DeIT (ViT-based) for a similar number of parameters. On the MS-COCO object detection task, MobileViT is 5.7% more accurate than MobileNetv3 for a similar number of parameters.* -Tips: +This model was contributed by [matthijs](https://huggingface.co/Matthijs). The TensorFlow version of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). 
The original code and weights can be found [here](https://github.com/apple/ml-cvnets). + +## Usage tips - MobileViT is more like a CNN than a Transformer model. It does not work on sequence data but on batches of images. Unlike ViT, there are no embeddings. The backbone model outputs a feature map. You can follow [this tutorial](https://keras.io/examples/vision/mobilevit) for a lightweight introduction. - One can use [`MobileViTImageProcessor`] to prepare images for the model. Note that if you do your own preprocessing, the pretrained checkpoints expect images to be in BGR pixel order (not RGB). @@ -58,9 +60,6 @@ with open(tflite_filename, "wb") as f: The resulting model will be just **about an MB** making it a good fit for mobile applications where resources and network bandwidth can be constrained. - -This model was contributed by [matthijs](https://huggingface.co/Matthijs). The TensorFlow version of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code and weights can be found [here](https://github.com/apple/ml-cvnets). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with MobileViT. @@ -91,6 +90,9 @@ If you're interested in submitting a resource to be included here, please feel f - preprocess - post_process_semantic_segmentation + + + ## MobileViTModel [[autodoc]] MobileViTModel @@ -106,6 +108,9 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] MobileViTForSemanticSegmentation - forward + + + ## TFMobileViTModel [[autodoc]] TFMobileViTModel @@ -120,3 +125,6 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] TFMobileViTForSemanticSegmentation - call + + + \ No newline at end of file diff --git a/docs/source/en/model_doc/mobilevitv2.md b/docs/source/en/model_doc/mobilevitv2.md index 4b6689ef2b4..c3a650fc704 100644 --- a/docs/source/en/model_doc/mobilevitv2.md +++ b/docs/source/en/model_doc/mobilevitv2.md @@ -26,17 +26,16 @@ The abstract from the paper is the following: *Mobile vision transformers (MobileViT) can achieve state-of-the-art performance across several mobile vision tasks, including classification and detection. Though these models have fewer parameters, they have high latency as compared to convolutional neural network-based models. The main efficiency bottleneck in MobileViT is the multi-headed self-attention (MHA) in transformers, which requires O(k2) time complexity with respect to the number of tokens (or patches) k. Moreover, MHA requires costly operations (e.g., batch-wise matrix multiplication) for computing self-attention, impacting latency on resource-constrained devices. This paper introduces a separable self-attention method with linear complexity, i.e. O(k). A simple yet effective characteristic of the proposed method is that it uses element-wise operations for computing self-attention, making it a good choice for resource-constrained devices. The improved model, MobileViTV2, is state-of-the-art on several mobile vision tasks, including ImageNet object classification and MS-COCO object detection. With about three million parameters, MobileViTV2 achieves a top-1 accuracy of 75.6% on the ImageNet dataset, outperforming MobileViT by about 1% while running 3.2Γ— faster on a mobile device.* -Tips: +This model was contributed by [shehan97](https://huggingface.co/shehan97). +The original code can be found [here](https://github.com/apple/ml-cvnets). 
+ +## Usage tips - MobileViTV2 is more like a CNN than a Transformer model. It does not work on sequence data but on batches of images. Unlike ViT, there are no embeddings. The backbone model outputs a feature map. - One can use [`MobileViTImageProcessor`] to prepare images for the model. Note that if you do your own preprocessing, the pretrained checkpoints expect images to be in BGR pixel order (not RGB). - The available image classification checkpoints are pre-trained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k) (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes). - The segmentation model uses a [DeepLabV3](https://arxiv.org/abs/1706.05587) head. The available semantic segmentation checkpoints are pre-trained on [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/). -This model was contributed by [shehan97](https://huggingface.co/shehan97). -The original code can be found [here](https://github.com/apple/ml-cvnets). - - ## MobileViTV2Config [[autodoc]] MobileViTV2Config diff --git a/docs/source/en/model_doc/mpnet.md b/docs/source/en/model_doc/mpnet.md index 97c140f631d..c571da47b00 100644 --- a/docs/source/en/model_doc/mpnet.md +++ b/docs/source/en/model_doc/mpnet.md @@ -37,14 +37,14 @@ down-streaming tasks (GLUE, SQuAD, etc). Experimental results show that MPNet ou margin, and achieves better results on these tasks compared with previous state-of-the-art pre-trained methods (e.g., BERT, XLNet, RoBERTa) under the same model setting.* -Tips: - -- MPNet doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. just - separate your segments with the separation token `tokenizer.sep_token` (or `[sep]`). - The original code can be found [here](https://github.com/microsoft/MPNet). -## Documentation resources +## Usage tips + +MPNet doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just +separate your segments with the separation token `tokenizer.sep_token` (or `[sep]`). + +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -68,6 +68,9 @@ The original code can be found [here](https://github.com/microsoft/MPNet). [[autodoc]] MPNetTokenizerFast + + + ## MPNetModel [[autodoc]] MPNetModel @@ -98,6 +101,9 @@ The original code can be found [here](https://github.com/microsoft/MPNet). [[autodoc]] MPNetForQuestionAnswering - forward + + + ## TFMPNetModel [[autodoc]] TFMPNetModel @@ -127,3 +133,6 @@ The original code can be found [here](https://github.com/microsoft/MPNet). [[autodoc]] TFMPNetForQuestionAnswering - call + + + diff --git a/docs/source/en/model_doc/mpt.md b/docs/source/en/model_doc/mpt.md index fd0a3b5c46b..f7e6fcc1438 100644 --- a/docs/source/en/model_doc/mpt.md +++ b/docs/source/en/model_doc/mpt.md @@ -30,13 +30,14 @@ The original code is available at the [`llm-foundry`](https://github.com/mosaic Read more about it [in the release blogpost](https://www.mosaicml.com/blog/mpt-7b) -Tips: +## Usage tips - Learn more about some techniques behind training of the model [in this section of llm-foundry repository](https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#faqs) - If you want to use the advanced version of the model (triton kernels, direct flash attention integration), you can still use the original model implementation by adding `trust_remote_code=True` when calling `from_pretrained`. 
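To make the two loading paths concrete, here is a minimal sketch (not taken from the original MPT documentation) that assumes the publicly released `mosaicml/mpt-7b` checkpoint; only the second call pulls the MosaicML implementation from the Hub:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "mosaicml/mpt-7b"  # example checkpoint, used here purely for illustration

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Native Transformers implementation of MPT
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Original MosaicML implementation (Triton kernels, direct flash-attention integration)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
```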
-- [Fine-tuning Notebook](https://colab.research.google.com/drive/1HCpQkLL7UXW8xJUJJ29X7QAeNJKO0frZ?usp=sharing) on how to fine-tune MPT-7B on a free Google Colab instance to turn the model into a Chatbot. +## Resources +- [Fine-tuning Notebook](https://colab.research.google.com/drive/1HCpQkLL7UXW8xJUJJ29X7QAeNJKO0frZ?usp=sharing) on how to fine-tune MPT-7B on a free Google Colab instance to turn the model into a Chatbot. ## MptConfig diff --git a/docs/source/en/model_doc/mra.md b/docs/source/en/model_doc/mra.md index 8c1c392ead1..cc4c0d9cc9c 100644 --- a/docs/source/en/model_doc/mra.md +++ b/docs/source/en/model_doc/mra.md @@ -27,24 +27,20 @@ The abstract from the paper is the following: This model was contributed by [novice03](https://huggingface.co/novice03). The original code can be found [here](https://github.com/mlpen/mra-attention). - ## MraConfig [[autodoc]] MraConfig - ## MraModel [[autodoc]] MraModel - forward - ## MraForMaskedLM [[autodoc]] MraForMaskedLM - forward - ## MraForSequenceClassification [[autodoc]] MraForSequenceClassification @@ -55,13 +51,11 @@ The original code can be found [here](https://github.com/mlpen/mra-attention). [[autodoc]] MraForMultipleChoice - forward - ## MraForTokenClassification [[autodoc]] MraForTokenClassification - forward - ## MraForQuestionAnswering [[autodoc]] MraForQuestionAnswering diff --git a/docs/source/en/model_doc/mt5.md b/docs/source/en/model_doc/mt5.md index beec9b53549..f7360092dec 100644 --- a/docs/source/en/model_doc/mt5.md +++ b/docs/source/en/model_doc/mt5.md @@ -60,7 +60,7 @@ Google has released the following variants: This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be found [here](https://github.com/google-research/multilingual-t5). -## Documentation resources +## Resources - [Translation task guide](../tasks/translation) - [Summarization task guide](../tasks/summarization) @@ -82,6 +82,8 @@ See [`T5Tokenizer`] for all details. See [`T5TokenizerFast`] for all details. + + ## MT5Model @@ -103,6 +105,9 @@ See [`T5TokenizerFast`] for all details. [[autodoc]] MT5ForQuestionAnswering + + + ## TFMT5Model [[autodoc]] TFMT5Model @@ -115,6 +120,9 @@ See [`T5TokenizerFast`] for all details. [[autodoc]] TFMT5EncoderModel + + + ## FlaxMT5Model [[autodoc]] FlaxMT5Model @@ -126,3 +134,6 @@ See [`T5TokenizerFast`] for all details. ## FlaxMT5EncoderModel [[autodoc]] FlaxMT5EncoderModel + + + diff --git a/docs/source/en/model_doc/mvp.md b/docs/source/en/model_doc/mvp.md index 043163f40b3..0d98e04cf09 100644 --- a/docs/source/en/model_doc/mvp.md +++ b/docs/source/en/model_doc/mvp.md @@ -28,15 +28,17 @@ According to the abstract, - MVP also has task-specific soft prompts to stimulate the model's capacity in performing a certain task. - MVP is specially designed for natural language generation and can be adapted to a wide range of generation tasks, including but not limited to summarization, data-to-text generation, open-ended dialogue system, story generation, question answering, question generation, task-oriented dialogue system, commonsense generation, paraphrase generation, text style transfer, and text simplification. Our model can also be adapted to natural language understanding tasks such as sequence classification and (extractive) question answering. -Tips: +This model was contributed by [Tianyi Tang](https://huggingface.co/StevenTang). The detailed information and instructions can be found [here](https://github.com/RUCAIBox/MVP). 
+ +## Usage tips + - We have released a series of models [here](https://huggingface.co/models?filter=mvp), including MVP, MVP with task-specific prompts, and multi-task pre-trained variants. - If you want to use a model without prompts (standard Transformer), you can load it through `MvpForConditionalGeneration.from_pretrained('RUCAIBox/mvp')`. - If you want to use a model with task-specific prompts, such as summarization, you can load it through `MvpForConditionalGeneration.from_pretrained('RUCAIBox/mvp-summarization')`. - Our model supports lightweight prompt tuning following [Prefix-tuning](https://arxiv.org/abs/2101.00190) with method `set_lightweight_tuning()`. -This model was contributed by [Tianyi Tang](https://huggingface.co/StevenTang). The detailed information and instructions can be found [here](https://github.com/RUCAIBox/MVP). +## Usage examples -## Examples For summarization, it is an example to use MVP and MVP with summarization-specific prompts. ```python @@ -104,7 +106,7 @@ For lightweight tuning, *i.e.*, fixing the model and only tuning prompts, you ca >>> model.set_lightweight_tuning() ``` -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Question answering task guide](../tasks/question_answering) diff --git a/docs/source/en/model_doc/nat.md b/docs/source/en/model_doc/nat.md index 668951c241f..ecb61ccb0a3 100644 --- a/docs/source/en/model_doc/nat.md +++ b/docs/source/en/model_doc/nat.md @@ -36,7 +36,18 @@ that boosts image classification and downstream vision performance. Experimental NAT-Tiny reaches 83.2% top-1 accuracy on ImageNet, 51.4% mAP on MS-COCO and 48.4% mIoU on ADE20K, which is 1.9% ImageNet accuracy, 1.0% COCO mAP, and 2.6% ADE20K mIoU improvement over a Swin model with similar size. * -Tips: + + + Neighborhood Attention compared to other attention patterns. +Taken from the original paper. + +This model was contributed by [Ali Hassani](https://huggingface.co/alihassanijr). +The original code can be found [here](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer). + +## Usage tips + - One can use the [`AutoImageProcessor`] API to prepare images for the model. - NAT can be used as a *backbone*. When `output_hidden_states = True`, it will output both `hidden_states` and `reshaped_hidden_states`. @@ -50,16 +61,6 @@ or build on your system by running `pip install natten`. Note that the latter will likely take time to compile. NATTEN does not support Windows devices yet. - Patch size of 4 is only supported at the moment. - - - Neighborhood Attention compared to other attention patterns. -Taken from the original paper. - -This model was contributed by [Ali Hassani](https://huggingface.co/alihassanijr). -The original code can be found [here](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with NAT. @@ -75,7 +76,6 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] NatConfig - ## NatModel [[autodoc]] NatModel diff --git a/docs/source/en/model_doc/nezha.md b/docs/source/en/model_doc/nezha.md index 9c136cdf066..872f576f128 100644 --- a/docs/source/en/model_doc/nezha.md +++ b/docs/source/en/model_doc/nezha.md @@ -35,7 +35,7 @@ and natural language inference (XNLI).* This model was contributed by [sijunhe](https://huggingface.co/sijunhe). 
The original code can be found [here](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-PyTorch). -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) diff --git a/docs/source/en/model_doc/nllb-moe.md b/docs/source/en/model_doc/nllb-moe.md index a98266b2492..eb2b7a7da26 100644 --- a/docs/source/en/model_doc/nllb-moe.md +++ b/docs/source/en/model_doc/nllb-moe.md @@ -37,22 +37,24 @@ improvements to counteract overfitting while training on thousands of tasks. Cri a human-translated benchmark, Flores-200, and combined human evaluation with a novel toxicity benchmark covering all languages in Flores-200 to assess translation safety. Our model achieves an improvement of 44% BLEU relative to the previous state-of-the-art, laying important groundwork towards realizing a universal translation system.* -Tips: +This model was contributed by [Arthur Zucker](https://huggingface.co/ArtZucker). +The original code can be found [here](https://github.com/facebookresearch/fairseq). + +## Usage tips - M2M100ForConditionalGeneration is the base model for both NLLB and NLLB MoE - The NLLB-MoE is very similar to the NLLB model, but it's feed forward layer is based on the implementation of SwitchTransformers. - The tokenizer is the same as the NLLB models. -This model was contributed by [Arthur Zucker](https://huggingface.co/ArtZucker). -The original code can be found [here](https://github.com/facebookresearch/fairseq). - ## Implementation differences with SwitchTransformers + The biggest difference is the way the tokens are routed. NLLB-MoE uses a `top-2-gate` which means that for each input, only the top two experts are selected based on the highest predicted probabilities from the gating network, and the remaining experts are ignored. In `SwitchTransformers`, only the top-1 probabilities are computed, which means that tokens have less probability of being forwarded. Moreover, if a token is not routed to any expert, `SwitchTransformers` still adds its unmodified hidden states (kind of like a residual connection) while they are masked in `NLLB`'s top-2 routing mechanism. ## Generating with NLLB-MoE + The available checkpoints require around 350GB of storage. Make sure to use `accelerate` if you do not have enough RAM on your machine. While generating the target text set the `forced_bos_token_id` to the target language id. The following @@ -99,7 +101,7 @@ See example below for a translation from romanian to german: >>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] ``` -## Documentation resources +## Resources - [Translation task guide](../tasks/translation) - [Summarization task guide](../tasks/summarization) diff --git a/docs/source/en/model_doc/nllb.md b/docs/source/en/model_doc/nllb.md index ec50716c73c..b0dffa185ec 100644 --- a/docs/source/en/model_doc/nllb.md +++ b/docs/source/en/model_doc/nllb.md @@ -16,8 +16,9 @@ rendered properly in your Markdown viewer. # NLLB -**DISCLAIMER:** The default behaviour for the tokenizer has recently been fixed (and thus changed)! +## Updated tokenizer behavior +**DISCLAIMER:** The default behaviour for the tokenizer was fixed and thus changed in April 2023. The previous version adds `[self.eos_token_id, self.cur_lang_code]` at the end of the token sequence for both target and source tokenization. This is wrong as the NLLB paper mentions (page 48, 6.1.1. 
Model Architecture) : *Note that we prefix the source sequence with the source language, as opposed to the target @@ -56,7 +57,7 @@ Enabling the old behaviour can be done as follows: For more details, feel free to check the linked [PR](https://github.com/huggingface/transformers/pull/22313) and [Issue](https://github.com/huggingface/transformers/issues/19943). -## Overview of NLLB +## Overview The NLLB model was presented in [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by Marta R. Costa-jussΓ , James Cross, Onur Γ‡elebi, Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula, @@ -131,7 +132,7 @@ See example below for a translation from romanian to german: UN-Chef sagt, es gibt keine militΓ€rische LΓΆsung in Syrien ``` -## Documentation resources +## Resources - [Translation task guide](../tasks/translation) - [Summarization task guide](../tasks/summarization) diff --git a/docs/source/en/model_doc/nougat.md b/docs/source/en/model_doc/nougat.md index 3fcb97a541b..a39e74eb213 100644 --- a/docs/source/en/model_doc/nougat.md +++ b/docs/source/en/model_doc/nougat.md @@ -33,7 +33,7 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/nougat). -Tips: +## Usage tips - The quickest way to get started with Nougat is by checking the [tutorial notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Nougat), which show how to use the model @@ -89,6 +89,12 @@ into a single instance to both extract the input features and decode the predict See the [model hub](https://huggingface.co/models?filter=nougat) to look for Nougat checkpoints. + + +The model is identical to [Donut](donut) in terms of architecture. + + + ## NougatImageProcessor [[autodoc]] NougatImageProcessor diff --git a/docs/source/en/model_doc/nystromformer.md b/docs/source/en/model_doc/nystromformer.md index 6434944aba8..185c4e1f011 100644 --- a/docs/source/en/model_doc/nystromformer.md +++ b/docs/source/en/model_doc/nystromformer.md @@ -37,7 +37,7 @@ favorably relative to other efficient self-attention methods. Our code is availa This model was contributed by [novice03](https://huggingface.co/novice03). The original code can be found [here](https://github.com/mlpen/Nystromformer). -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) diff --git a/docs/source/en/model_doc/oneformer.md b/docs/source/en/model_doc/oneformer.md index 5f8f46e1529..97a6aa64f54 100644 --- a/docs/source/en/model_doc/oneformer.md +++ b/docs/source/en/model_doc/oneformer.md @@ -26,7 +26,14 @@ The abstract from the paper is the following: *Universal Image Segmentation is not a new concept. Past attempts to unify image segmentation in the last decades include scene parsing, panoptic segmentation, and, more recently, new panoptic architectures. However, such panoptic architectures do not truly unify image segmentation because they need to be trained individually on the semantic, instance, or panoptic segmentation to achieve the best performance. Ideally, a truly universal framework should be trained only once and achieve SOTA performance across all three image segmentation tasks. 
To that end, we propose OneFormer, a universal image segmentation framework that unifies segmentation with a multi-task train-once design. We first propose a task-conditioned joint training strategy that enables training on ground truths of each domain (semantic, instance, and panoptic segmentation) within a single multi-task training process. Secondly, we introduce a task token to condition our model on the task at hand, making our model task-dynamic to support multi-task training and inference. Thirdly, we propose using a query-text contrastive loss during training to establish better inter-task and inter-class distinctions. Notably, our single OneFormer model outperforms specialized Mask2Former models across all three segmentation tasks on ADE20k, CityScapes, and COCO, despite the latter being trained on each of the three tasks individually with three times the resources. With new ConvNeXt and DiNAT backbones, we observe even more performance improvement. We believe OneFormer is a significant step towards making image segmentation more universal and accessible.* -Tips: +The figure below illustrates the architecture of OneFormer. Taken from the [original paper](https://arxiv.org/abs/2211.06220). + + + +This model was contributed by [Jitesh Jain](https://huggingface.co/praeclarumjj3). The original code can be found [here](https://github.com/SHI-Labs/OneFormer). + +## Usage tips + - OneFormer requires two inputs during inference: *image* and *task token*. - During training, OneFormer only uses panoptic annotations. - If you want to train the model in a distributed environment across multiple nodes, then one should update the @@ -35,12 +42,6 @@ Tips: - One can use [`OneFormerProcessor`] to prepare input images and task inputs for the model and optional targets for the model. [`OneformerProcessor`] wraps [`OneFormerImageProcessor`] and [`CLIPTokenizer`] into a single instance to both prepare the images and encode the task inputs. - To get the final segmentation, depending on the task, you can call [`~OneFormerProcessor.post_process_semantic_segmentation`] or [`~OneFormerImageProcessor.post_process_instance_segmentation`] or [`~OneFormerImageProcessor.post_process_panoptic_segmentation`]. All three tasks can be solved using [`OneFormerForUniversalSegmentation`] output, panoptic segmentation accepts an optional `label_ids_to_fuse` argument to fuse instances of the target object/s (e.g. sky) together. -The figure below illustrates the architecture of OneFormer. Taken from the [original paper](https://arxiv.org/abs/2211.06220). - - - -This model was contributed by [Jitesh Jain](https://huggingface.co/praeclarumjj3). The original code can be found [here](https://github.com/SHI-Labs/OneFormer). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OneFormer. diff --git a/docs/source/en/model_doc/open-llama.md b/docs/source/en/model_doc/open-llama.md index 9663170c408..01170e7e3be 100644 --- a/docs/source/en/model_doc/open-llama.md +++ b/docs/source/en/model_doc/open-llama.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. -This model is in maintenance mode only, so we won't accept any new PRs changing its code. +This model is in maintenance mode only, we don't accept any new PRs changing its code. If you run into any issues running this model, please reinstall the last version that supported this model: v4.31.0. You can do so by running the following command: `pip install -U transformers==4.31.0`. 
diff --git a/docs/source/en/model_doc/openai-gpt.md b/docs/source/en/model_doc/openai-gpt.md index ff98930b576..1fbfbbcd89e 100644 --- a/docs/source/en/model_doc/openai-gpt.md +++ b/docs/source/en/model_doc/openai-gpt.md @@ -44,7 +44,12 @@ approach on a wide range of benchmarks for natural language understanding. Our g discriminatively trained models that use architectures specifically crafted for each task, significantly improving upon the state of the art in 9 out of the 12 tasks studied.* -Tips: +[Write With Transformer](https://transformer.huggingface.co/doc/gpt) is a webapp created and hosted by Hugging Face +showcasing the generative capabilities of several models. GPT is one of them. + +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/openai/finetune-transformer-lm). + +## Usage tips - GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. @@ -52,10 +57,6 @@ Tips: token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as it can be observed in the *run_generation.py* example script. -[Write With Transformer](https://transformer.huggingface.co/doc/gpt) is a webapp created and hosted by Hugging Face -showcasing the generative capabilities of several models. GPT is one of them. - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/openai/finetune-transformer-lm). Note: @@ -116,6 +117,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] models.openai.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput + + + ## OpenAIGPTModel [[autodoc]] OpenAIGPTModel @@ -136,6 +140,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] OpenAIGPTForSequenceClassification - forward + + + ## TFOpenAIGPTModel [[autodoc]] TFOpenAIGPTModel @@ -155,3 +162,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TFOpenAIGPTForSequenceClassification - call + + + diff --git a/docs/source/en/model_doc/opt.md b/docs/source/en/model_doc/opt.md index 332c63600ac..68da201f99b 100644 --- a/docs/source/en/model_doc/opt.md +++ b/docs/source/en/model_doc/opt.md @@ -25,13 +25,13 @@ The abstract from the paper is the following: *Large language models, which are often trained for hundreds of thousands of compute days, have shown remarkable capabilities for zero- and few-shot learning. Given their computational cost, these models are difficult to replicate without significant capital. For the few that are available through APIs, no access is granted to the full model weights, making them difficult to study. We present Open Pre-trained Transformers (OPT), a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters, which we aim to fully and responsibly share with interested researchers. We show that OPT-175B is comparable to GPT-3, while requiring only 1/7th the carbon footprint to develop. We are also releasing our logbook detailing the infrastructure challenges we faced, along with code for experimenting with all of the released models.* +This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), and [Patrick Von Platen](https://huggingface.co/patrickvonplaten). 
+The original code can be found [here](https://github.com/facebookresearch/metaseq). + Tips: - OPT has the same architecture as [`BartDecoder`]. - Contrary to GPT2, OPT adds the EOS token `` to the beginning of every prompt. -This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), and [Patrick Von Platen](https://huggingface.co/patrickvonplaten). -The original code can be found [here](https://github.com/facebookresearch/metaseq). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OPT. If you're @@ -66,6 +66,9 @@ The resource should ideally demonstrate something new instead of duplicating an [[autodoc]] OPTConfig + + + ## OPTModel [[autodoc]] OPTModel @@ -76,16 +79,6 @@ The resource should ideally demonstrate something new instead of duplicating an [[autodoc]] OPTForCausalLM - forward -## TFOPTModel - -[[autodoc]] TFOPTModel - - call - -## TFOPTForCausalLM - -[[autodoc]] TFOPTForCausalLM - - call - ## OPTForSequenceClassification [[autodoc]] OPTForSequenceClassification @@ -96,13 +89,31 @@ The resource should ideally demonstrate something new instead of duplicating an [[autodoc]] OPTForQuestionAnswering - forward + + + +## TFOPTModel + +[[autodoc]] TFOPTModel + - call + +## TFOPTForCausalLM + +[[autodoc]] TFOPTForCausalLM + - call + + + + ## FlaxOPTModel [[autodoc]] FlaxOPTModel - __call__ - ## FlaxOPTForCausalLM [[autodoc]] FlaxOPTForCausalLM - __call__ + + + diff --git a/docs/source/en/model_doc/owlv2.md b/docs/source/en/model_doc/owlv2.md index 73063c59350..12000af9ed4 100644 --- a/docs/source/en/model_doc/owlv2.md +++ b/docs/source/en/model_doc/owlv2.md @@ -24,11 +24,6 @@ The abstract from the paper is the following: *Open-vocabulary object detection has benefited greatly from pretrained vision-language models, but is still limited by the amount of available detection training data. While detection training data can be expanded by using Web image-text pairs as weak supervision, this has not been done at scales comparable to image-level pretraining. Here, we scale up detection data with self-training, which uses an existing detector to generate pseudo-box annotations on image-text pairs. Major challenges in scaling self-training are the choice of label space, pseudo-annotation filtering, and training efficiency. We present the OWLv2 model and OWL-ST self-training recipe, which address these challenges. OWLv2 surpasses the performance of previous state-of-the-art open-vocabulary detectors already at comparable training scales (~10M examples). However, with OWL-ST, we can scale to over 1B examples, yielding further large improvement: With an L/14 architecture, OWL-ST improves AP on LVIS rare classes, for which the model has seen no human box annotations, from 31.2% to 44.6% (43% relative improvement). OWL-ST unlocks Web-scale training for open-world localization, similar to what has been seen for image classification and language modelling.* -Tips: - -- The architecture of OWLv2 is identical to [OWL-ViT](owlvit), however the object detection head now also includes an objectness classifier, which predicts the (query-agnostic) likelihood that a predicted box contains an object (as opposed to background). The objectness score can be used to rank or filter predictions independently of text queries. -- Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image processor ([`Owlv2ImageProcessor`]). 
- drawing @@ -37,13 +32,12 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit). -## Usage +## Usage example OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection. [`Owlv2ImageProcessor`] can be used to resize (or rescale) and normalize images for the model and [`CLIPTokenizer`] is used to encode the text. [`Owlv2Processor`] wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`] into a single instance to both encode the text and prepare the images. The following example shows how to perform object detection using [`Owlv2Processor`] and [`Owlv2ForObjectDetection`]. - ```python >>> import requests >>> from PIL import Image @@ -76,7 +70,15 @@ Detected a photo of a cat with confidence 0.665 at location [6.75, 38.97, 326.62 ## Resources -A demo notebook on using OWLv2 for zero- and one-shot (image-guided) object detection can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/OWLv2). +- A demo notebook on using OWLv2 for zero- and one-shot (image-guided) object detection can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/OWLv2). +- [Zero-shot object detection task guide](../tasks/zero_shot_object_detection) + + + +The architecture of OWLv2 is identical to [OWL-ViT](owlvit), however the object detection head now also includes an objectness classifier, which predicts the (query-agnostic) likelihood that a predicted box contains an object (as opposed to background). The objectness score can be used to rank or filter predictions independently of text queries. +Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image processor ([`Owlv2ImageProcessor`]). + + ## Owlv2Config diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md index 712d0f62d78..0ba26eeb37b 100644 --- a/docs/source/en/model_doc/owlvit.md +++ b/docs/source/en/model_doc/owlvit.md @@ -31,13 +31,12 @@ alt="drawing" width="600"/> This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit). -## Usage +## Usage tips OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. 
Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection. [`OwlViTImageProcessor`] can be used to resize (or rescale) and normalize images for the model and [`CLIPTokenizer`] is used to encode the text. [`OwlViTProcessor`] wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`] into a single instance to both encode the text and prepare the images. The following example shows how to perform object detection using [`OwlViTProcessor`] and [`OwlViTForObjectDetection`]. - ```python >>> import requests >>> from PIL import Image diff --git a/docs/source/en/model_doc/pegasus.md b/docs/source/en/model_doc/pegasus.md index 14608aae31c..0622354e62d 100644 --- a/docs/source/en/model_doc/pegasus.md +++ b/docs/source/en/model_doc/pegasus.md @@ -25,9 +25,6 @@ rendered properly in your Markdown viewer. -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title) -and assign @patrickvonplaten. - ## Overview @@ -42,13 +39,17 @@ According to the abstract, This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The Authors' code can be found [here](https://github.com/google-research/pegasus). -Tips: +## Usage tips - Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining objective, called Gap Sentence Generation (GSG). * MLM: encoder input tokens are randomly replaced by a mask tokens and have to be predicted by the encoder (like in BERT) * GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a causal mask to hide the future words like a regular auto-regressive transformer decoder. +- FP16 is not supported (help/ideas on this appreciated!). +- The adafactor optimizer is recommended for pegasus fine-tuning. + + ## Checkpoints All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tuned for summarization, besides @@ -60,20 +61,11 @@ All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tun - Full replication results and correctly pre-processed data can be found in this [Issue](https://github.com/huggingface/transformers/issues/6844#issue-689259666). - [Distilled checkpoints](https://huggingface.co/models?search=distill-pegasus) are described in this [paper](https://arxiv.org/abs/2010.13002). -### Examples - -- [Script](https://github.com/huggingface/transformers/tree/main/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh) to fine-tune pegasus - on the XSUM dataset. Data download instructions at [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md). -- FP16 is not supported (help/ideas on this appreciated!). -- The adafactor optimizer is recommended for pegasus fine-tuning. - - ## Implementation Notes - All models are transformer encoder-decoders with 16 layers in each component. 
- The implementation is completely inherited from [`BartForConditionalGeneration`] - Some key configuration differences: - - static, sinusoidal position embeddings - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix. - more beams are used (`num_beams=8`) @@ -82,7 +74,6 @@ All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tun - The code to convert checkpoints trained in the author's [repo](https://github.com/google-research/pegasus) can be found in `convert_pegasus_tf_to_pytorch.py`. - ## Usage Example ```python @@ -106,8 +97,10 @@ All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tun ... ) ``` -## Documentation resources +## Resources +- [Script](https://github.com/huggingface/transformers/tree/main/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh) to fine-tune pegasus + on the XSUM dataset. Data download instructions at [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md). - [Causal language modeling task guide](../tasks/language_modeling) - [Translation task guide](../tasks/translation) - [Summarization task guide](../tasks/summarization) @@ -126,6 +119,9 @@ warning: `add_tokens` does not work at the moment. [[autodoc]] PegasusTokenizerFast + + + ## PegasusModel [[autodoc]] PegasusModel @@ -141,6 +137,9 @@ warning: `add_tokens` does not work at the moment. [[autodoc]] PegasusForCausalLM - forward + + + ## TFPegasusModel [[autodoc]] TFPegasusModel @@ -151,6 +150,9 @@ warning: `add_tokens` does not work at the moment. [[autodoc]] TFPegasusForConditionalGeneration - call + + + ## FlaxPegasusModel [[autodoc]] FlaxPegasusModel @@ -164,3 +166,6 @@ warning: `add_tokens` does not work at the moment. - __call__ - encode - decode + + + diff --git a/docs/source/en/model_doc/pegasus_x.md b/docs/source/en/model_doc/pegasus_x.md index a0fd670fc7c..20af5731e90 100644 --- a/docs/source/en/model_doc/pegasus_x.md +++ b/docs/source/en/model_doc/pegasus_x.md @@ -26,10 +26,6 @@ The abstract from the paper is the following: *While large pretrained Transformer models have proven highly capable at tackling natural language tasks, handling long sequence inputs continues to be a significant challenge. One such task is long input summarization, where inputs are longer than the maximum input context of most pretrained models. Through an extensive set of experiments, we investigate what model architectural changes and pretraining paradigms can most efficiently adapt a pretrained Transformer for long input summarization. We find that a staggered, block-local Transformer with global encoder tokens strikes a good balance of performance and efficiency, and that an additional pretraining phase on long sequences meaningfully improves downstream summarization performance. Based on our findings, we introduce PEGASUS-X, an extension of the PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. PEGASUS-X achieves strong performance on long input summarization tasks comparable with much larger models while adding few additional parameters and not requiring model parallelism to train.* -Tips: - -* PEGASUS-X uses the same tokenizer as PEGASUS. - This model was contributed by [zphang]( + +PEGASUS-X uses the same tokenizer as [PEGASUS](pegasus). 
+ + + ## PegasusXConfig [[autodoc]] PegasusXConfig - ## PegasusXModel [[autodoc]] PegasusXModel - forward - ## PegasusXForConditionalGeneration [[autodoc]] PegasusXForConditionalGeneration diff --git a/docs/source/en/model_doc/perceiver.md b/docs/source/en/model_doc/perceiver.md index 97921baed2b..ee678c22f6f 100644 --- a/docs/source/en/model_doc/perceiver.md +++ b/docs/source/en/model_doc/perceiver.md @@ -81,7 +81,13 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/deepmind/deepmind-research/tree/master/perceiver). -Tips: + + +Perceiver does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) + + + +## Resources - The quickest way to get started with the Perceiver is by checking the [tutorial notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Perceiver). @@ -89,13 +95,6 @@ Tips: is implemented in the library. Note that the models available in the library only showcase some examples of what you can do with the Perceiver. There are many more use cases, including question answering, named-entity recognition, object detection, audio classification, video classification, etc. - -**Note**: - -- Perceiver does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) - -## Documentation resources - - [Text classification task guide](../tasks/sequence_classification) - [Masked language modeling task guide](../tasks/masked_language_modeling) - [Image classification task guide](../tasks/image_classification) diff --git a/docs/source/en/model_doc/persimmon.md b/docs/source/en/model_doc/persimmon.md index cf13d070c62..fe9e66a0b71 100644 --- a/docs/source/en/model_doc/persimmon.md +++ b/docs/source/en/model_doc/persimmon.md @@ -26,6 +26,10 @@ The authors showcase their approach to model evaluation, focusing on practical t In terms of model details, the work outlines the architecture and training methodology of Persimmon-8B, providing insights into its design choices, sequence length, and dataset composition. The authors present a fast inference code that outperforms traditional implementations through operator fusion and CUDA graph utilization while maintaining code coherence. They express their anticipation of how the community will leverage this contribution to drive innovation, hinting at further upcoming releases as part of an ongoing series of developments. +This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ). +The original code can be found [here](https://github.com/persimmon-ai-labs/adept-inference). + +## Usage tips @@ -67,8 +71,6 @@ model = PersimmonForCausalLM.from_pretrained("/output/path") tokenizer = PersimmonTokenizer.from_pretrained("/output/path") ``` -This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ). -The original code can be found [here](https://github.com/persimmon-ai-labs/adept-inference). - Perismmon uses a `sentencepiece` based tokenizer, with a `Unigram` model. It supports bytefallback, which is only available in `tokenizers==0.14.0` for the fast tokenizer. The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece. The `chat` template will be updated with the templating functions in a follow up PR! 
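Once a checkpoint has been converted as described above (or when loading weights already published on the Hub), Persimmon can be driven like any other causal LM. The snippet below is a minimal sketch rather than part of the original documentation, and the `adept/persimmon-8b-base` checkpoint name is an assumption used purely for illustration:

```python
from transformers import AutoTokenizer, PersimmonForCausalLM

checkpoint = "adept/persimmon-8b-base"  # assumed example checkpoint

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = PersimmonForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("The capital of France is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```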
diff --git a/docs/source/en/model_doc/phobert.md b/docs/source/en/model_doc/phobert.md
index 5543a9b3541..30a50275476 100644
--- a/docs/source/en/model_doc/phobert.md
+++ b/docs/source/en/model_doc/phobert.md
@@ -28,7 +28,9 @@ best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves th
 Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and
 Natural language inference.*
 
-Example of use:
+This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/PhoBERT).
+
+## Usage example
 
 ```python
 >>> import torch
 >>> from transformers import AutoModel, AutoTokenizer
@@ -50,7 +52,12 @@ Example of use:
 >>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
 ```
 
-This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/PhoBERT).
+
+
+PhoBERT's implementation is the same as BERT's, except for tokenization. Refer to the [BERT documentation](bert) for information on
+configuration classes and their parameters. The PhoBERT-specific tokenizer is documented below.
+
+
 
 ## PhobertTokenizer
 
 [[autodoc]] PhobertTokenizer
diff --git a/docs/source/en/model_doc/pix2struct.md b/docs/source/en/model_doc/pix2struct.md
index b722a59b82e..8dc179f5f86 100644
--- a/docs/source/en/model_doc/pix2struct.md
+++ b/docs/source/en/model_doc/pix2struct.md
@@ -39,7 +39,6 @@ The original code can be found [here](https://github.com/google-research/pix2str
 - [Fine-tuning Notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb)
 - [All models](https://huggingface.co/models?search=pix2struct)
 
-
 ## Pix2StructConfig
 
 [[autodoc]] Pix2StructConfig
diff --git a/docs/source/en/model_doc/plbart.md b/docs/source/en/model_doc/plbart.md
index c9f50202148..61af52e54d0 100644
--- a/docs/source/en/model_doc/plbart.md
+++ b/docs/source/en/model_doc/plbart.md
@@ -16,10 +16,7 @@ rendered properly in your Markdown viewer.
 
 # PLBart
 
-**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign
-[@gchhablani](https://www.github.com/gchhablani).
-
-## Overview of PLBart
+## Overview
 
 The PLBART model was proposed in [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
 This is a BART-like model which can be used to perform code-summarization, code-generation, and code-translation tasks. The pre-trained model
 `plbart-base` has been trained using multilingual denoising task
@@ -40,7 +37,7 @@ even with limited annotations.*
 
 This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The Authors' code can be found [here](https://github.com/wasiahmad/PLBART).
 
-### Training of PLBart
+## Usage examples
 
 PLBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for code-to-text, text-to-code,
 code-to-code tasks. As the model is multilingual it expects the sequences in a different format. A special language id token is added in both the
@@ -53,7 +50,7 @@ In cases where the language code is needed, the regular [`~PLBartTokenizer.__cal
 when you pass texts as the first argument or with the keyword argument `text`, and will encode target text format if
 it's passed with the `text_target` keyword argument.
-- Supervised training +### Supervised training ```python >>> from transformers import PLBartForConditionalGeneration, PLBartTokenizer @@ -65,7 +62,7 @@ it's passed with the `text_target` keyword argument. >>> model(**inputs) ``` -- Generation +### Generation While generating the target text set the `decoder_start_token_id` to the target language id. The following example shows how to translate Python to English using the `uclanlp/plbart-python-en_XX` model. @@ -82,7 +79,7 @@ it's passed with the `text_target` keyword argument. "Returns the maximum value of a b c." ``` -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Causal language modeling task guide](../tasks/language_modeling) diff --git a/docs/source/en/model_doc/poolformer.md b/docs/source/en/model_doc/poolformer.md index 537c60bdbcf..823c4412485 100644 --- a/docs/source/en/model_doc/poolformer.md +++ b/docs/source/en/model_doc/poolformer.md @@ -28,8 +28,9 @@ The figure below illustrates the architecture of PoolFormer. Taken from the [ori +This model was contributed by [heytanay](https://huggingface.co/heytanay). The original code can be found [here](https://github.com/sail-sg/poolformer). -Tips: +## Usage tips - PoolFormer has a hierarchical architecture, where instead of Attention, a simple Average Pooling layer is present. All checkpoints of the model can be found on the [hub](https://huggingface.co/models?other=poolformer). - One can use [`PoolFormerImageProcessor`] to prepare images for the model. @@ -43,8 +44,6 @@ Tips: | m36 | [6, 6, 18, 6] | [96, 192, 384, 768] | 56 | 82.1 | | m48 | [8, 8, 24, 8] | [96, 192, 384, 768] | 73 | 82.5 | -This model was contributed by [heytanay](https://huggingface.co/heytanay). The original code can be found [here](https://github.com/sail-sg/poolformer). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with PoolFormer. diff --git a/docs/source/en/model_doc/pop2piano.md b/docs/source/en/model_doc/pop2piano.md index 95fd83f1923..8e52eda70cc 100644 --- a/docs/source/en/model_doc/pop2piano.md +++ b/docs/source/en/model_doc/pop2piano.md @@ -32,7 +32,6 @@ is transformed to its waveform and passed to the encoder, which transforms it to uses these latent representations to generate token ids in an autoregressive way. Each token id corresponds to one of four different token types: time, velocity, note and 'special'. The token ids are then decoded to their equivalent MIDI file. - The abstract from the paper is the following: *Piano covers of pop music are enjoyed by many people. However, the @@ -49,22 +48,21 @@ directly from pop audio without using melody and chord extraction modules. We show that Pop2Piano, trained with our dataset, is capable of producing plausible piano covers.* +This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). +The original code can be found [here](https://github.com/sweetcocoa/pop2piano). -Tips: +## Usage tips -1. To use Pop2Piano, you will need to install the πŸ€— Transformers library, as well as the following third party modules: +* To use Pop2Piano, you will need to install the πŸ€— Transformers library, as well as the following third party modules: ``` pip install pretty-midi==0.2.9 essentia==2.1b6.dev1034 librosa scipy ``` Please note that you may need to restart your runtime after installation. -2. Pop2Piano is an Encoder-Decoder based model like T5. -3. 
Pop2Piano can be used to generate midi-audio files for a given audio sequence. -4. Choosing different composers in `Pop2PianoForConditionalGeneration.generate()` can lead to variety of different results. -5. Setting the sampling rate to 44.1 kHz when loading the audio file can give good performance. -6. Though Pop2Piano was mainly trained on Korean Pop music, it also does pretty well on other Western Pop or Hip Hop songs. - -This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). -The original code can be found [here](https://github.com/sweetcocoa/pop2piano). +* Pop2Piano is an Encoder-Decoder based model like T5. +* Pop2Piano can be used to generate midi-audio files for a given audio sequence. +* Choosing different composers in `Pop2PianoForConditionalGeneration.generate()` can lead to a variety of different results. +* Setting the sampling rate to 44.1 kHz when loading the audio file can give good performance. +* Though Pop2Piano was mainly trained on Korean Pop music, it also does pretty well on other Western Pop or Hip Hop songs. ## Examples diff --git a/docs/source/en/model_doc/prophetnet.md b/docs/source/en/model_doc/prophetnet.md index 6ab0937da77..7e63e0c0887 100644 --- a/docs/source/en/model_doc/prophetnet.md +++ b/docs/source/en/model_doc/prophetnet.md @@ -25,10 +25,6 @@ rendered properly in your Markdown viewer. - -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign -@patrickvonplaten - ## Overview The ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei @@ -49,15 +45,15 @@ dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Giga abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.* -Tips: +The Authors' code can be found [here](https://github.com/microsoft/ProphetNet). + +## Usage tips - ProphetNet is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. - The model architecture is based on the original Transformer, but replaces the “standard” self-attention mechanism in the decoder by a main self-attention mechanism and a self and n-stream (predict) self-attention mechanism. -The Authors' code can be found [here](https://github.com/microsoft/ProphetNet). - -## Documentation resources +## Resources - [Causal language modeling task guide](../tasks/language_modeling) - [Translation task guide](../tasks/translation) diff --git a/docs/source/en/model_doc/qdqbert.md b/docs/source/en/model_doc/qdqbert.md index 62a0e010843..9ee42ff3b49 100644 --- a/docs/source/en/model_doc/qdqbert.md +++ b/docs/source/en/model_doc/qdqbert.md @@ -32,22 +32,18 @@ by processors with high-throughput integer math pipelines. We also present a wor able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are more difficult to quantize, such as MobileNets and BERT-large.* -Tips: +This model was contributed by [shangz](https://huggingface.co/shangz).
+ +## Usage tips - QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to (i) linear layer inputs and weights, (ii) matmul inputs, (iii) residual add inputs, in BERT model. - - QDQBERT requires the dependency of [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com` - - QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *bert-base-uncased*), and perform Quantization Aware Training/Post Training Quantization. - - A complete example of using QDQBERT model to perform Quatization Aware Training and Post Training Quantization for SQUAD task can be found at [transformers/examples/research_projects/quantization-qdqbert/](examples/research_projects/quantization-qdqbert/). -This model was contributed by [shangz](https://huggingface.co/shangz). - - ### Set default quantizers QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to BERT by @@ -118,7 +114,7 @@ the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Exa >>> torch.onnx.export(...) ``` -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) diff --git a/docs/source/en/model_doc/rag.md b/docs/source/en/model_doc/rag.md index b467c6169f6..1891efe7426 100644 --- a/docs/source/en/model_doc/rag.md +++ b/docs/source/en/model_doc/rag.md @@ -52,8 +52,12 @@ parametric-only seq2seq baseline.* This model was contributed by [ola13](https://huggingface.co/ola13). -Tips: -- Retrieval-augmented generation (β€œRAG”) models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq models. RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs. The retriever and seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation to adapt to downstream tasks. +## Usage tips + +Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq models. +RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs. The retriever and seq2seq +modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation to adapt +to downstream tasks. ## RagConfig @@ -73,6 +77,9 @@ Tips: [[autodoc]] RagRetriever + + + ## RagModel [[autodoc]] RagModel @@ -90,6 +97,9 @@ Tips: - forward - generate + + + ## TFRagModel [[autodoc]] TFRagModel @@ -106,3 +116,6 @@ Tips: [[autodoc]] TFRagTokenForGeneration - call - generate + + + diff --git a/docs/source/en/model_doc/reformer.md b/docs/source/en/model_doc/reformer.md index 05274c7667b..ec924dc50c4 100644 --- a/docs/source/en/model_doc/reformer.md +++ b/docs/source/en/model_doc/reformer.md @@ -25,8 +25,6 @@ rendered properly in your Markdown viewer. -**DISCLAIMER:** This model is still a work in progress, if you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). - ## Overview The Reformer model was proposed in the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451.pdf) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 
@@ -44,7 +42,7 @@ while being much more memory-efficient and much faster on long sequences.* This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be found [here](https://github.com/google/trax/tree/master/trax/models/reformer). -Tips: +## Usage tips - Reformer does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035). - Use Axial position encoding (see below for more details). It’s a mechanism to avoid having a huge positional encoding matrix (when the sequence length is very big) by factorizing it into smaller matrices. @@ -52,7 +50,7 @@ Tips: - Avoid storing the intermediate results of each layer by using reversible transformer layers to obtain them during the backward pass (subtracting the residuals from the input of the next layer gives them back) or recomputing them for results inside a given layer (less efficient than storing them but saves memory). - Compute the feedforward operations by chunks and not on the whole batch. -## Axial Positional Encodings +### Axial Positional Encodings Axial Positional Encodings were first implemented in Google's [trax library](https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29) and developed by the authors of this model's paper. In models that are treating very long input sequences, the @@ -96,7 +94,7 @@ product has to be equal to `config.max_embedding_size`, which during training ha length* of the `input_ids`. -## LSH Self Attention +### LSH Self Attention In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. Therefore, the key query embedding vectors are also tied. LSH self attention uses the locality sensitive hashing mechanism proposed in @@ -129,7 +127,7 @@ Using LSH self attention, the memory and time complexity of the query-key matmul and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. -## Local Self Attention +### Local Self Attention Local self attention is essentially a "normal" self attention layer with key, query and value projections, but is chunked so that in each chunk of length `config.local_chunk_length` the query embedding vectors only attends to @@ -141,7 +139,7 @@ Using Local self attention, the memory and time complexity of the query-key matm and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. -## Training +### Training During training, we must ensure that the sequence length is set to a value that can be divided by the least common multiple of `config.lsh_chunk_length` and `config.local_chunk_length` and that the parameters of the Axial @@ -155,7 +153,7 @@ input_ids = tokenizer.encode("This is a sentence from the training data", return loss = model(input_ids, labels=input_ids)[0] ``` -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Question answering task guide](../tasks/question_answering) diff --git a/docs/source/en/model_doc/regnet.md b/docs/source/en/model_doc/regnet.md index 89e89459bd7..2a8f7e733d8 100644 --- a/docs/source/en/model_doc/regnet.md +++ b/docs/source/en/model_doc/regnet.md @@ -26,15 +26,13 @@ The abstract from the paper is the following: *In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. 
Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.* -Tips: - -- One can use [`AutoImageProcessor`] to prepare images for the model. -- The huge 10B model from [Self-supervised Pretraining of Visual Features in the Wild](https://arxiv.org/abs/2103.01988), trained on one billion Instagram images, is available on the [hub](https://huggingface.co/facebook/regnet-y-10b-seer) - This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of the model was contributed by [sayakpaul](https://huggingface.com/sayakpaul) and [ariG23498](https://huggingface.com/ariG23498). The original code can be found [here](https://github.com/facebookresearch/pycls). +The huge 10B model from [Self-supervised Pretraining of Visual Features in the Wild](https://arxiv.org/abs/2103.01988), +trained on one billion Instagram images, is available on the [hub](https://huggingface.co/facebook/regnet-y-10b-seer) + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RegNet. @@ -50,37 +48,43 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] RegNetConfig + + ## RegNetModel [[autodoc]] RegNetModel - forward - ## RegNetForImageClassification [[autodoc]] RegNetForImageClassification - forward + + + ## TFRegNetModel [[autodoc]] TFRegNetModel - call - ## TFRegNetForImageClassification [[autodoc]] TFRegNetForImageClassification - call + + ## FlaxRegNetModel [[autodoc]] FlaxRegNetModel - __call__ - ## FlaxRegNetForImageClassification [[autodoc]] FlaxRegNetForImageClassification - - __call__ \ No newline at end of file + - __call__ + + diff --git a/docs/source/en/model_doc/rembert.md b/docs/source/en/model_doc/rembert.md index b2e4d0f5ada..b755d342306 100644 --- a/docs/source/en/model_doc/rembert.md +++ b/docs/source/en/model_doc/rembert.md @@ -34,14 +34,14 @@ Transformer representations to be more general and more transferable to other ta findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the number of parameters at the fine-tuning stage.* -Tips: +## Usage tips For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the embedding layer. The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). The tokenizer is also similar to the Albert one rather than the BERT one. 
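In practice, a RemBERT checkpoint can be loaded and queried like any other BERT-style masked language model. The sketch below is a minimal example, assuming the released `google/rembert` checkpoint and its default `[MASK]` token; any other RemBERT checkpoint should work the same way:

```python
>>> import torch
>>> from transformers import AutoTokenizer, RemBertForMaskedLM

>>> # assumed checkpoint name; swap in a fine-tuned variant if needed
>>> tokenizer = AutoTokenizer.from_pretrained("google/rembert")
>>> model = RemBertForMaskedLM.from_pretrained("google/rembert")

>>> inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> # locate the [MASK] position and take the highest-scoring vocabulary entry for it
>>> mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
>>> predicted_id = logits[0, mask_index].argmax(dim=-1)
>>> print(tokenizer.decode(predicted_id))
```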
-## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -70,6 +70,9 @@ also similar to the Albert one rather than the BERT one. - create_token_type_ids_from_sequences - save_vocabulary + + + ## RemBertModel [[autodoc]] RemBertModel @@ -105,6 +108,9 @@ also similar to the Albert one rather than the BERT one. [[autodoc]] RemBertForQuestionAnswering - forward + + + ## TFRemBertModel [[autodoc]] TFRemBertModel @@ -139,3 +145,6 @@ also similar to the Albert one rather than the BERT one. [[autodoc]] TFRemBertForQuestionAnswering - call + + + diff --git a/docs/source/en/model_doc/resnet.md b/docs/source/en/model_doc/resnet.md index 9bb36a776f1..b959266512f 100644 --- a/docs/source/en/model_doc/resnet.md +++ b/docs/source/en/model_doc/resnet.md @@ -27,10 +27,6 @@ The abstract from the paper is the following: *Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.* -Tips: - -- One can use [`AutoImageProcessor`] to prepare images for the model. - The figure below illustrates the architecture of ResNet. Taken from the [original paper](https://arxiv.org/abs/1512.03385). 
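A minimal image-classification sketch, assuming the `microsoft/resnet-50` checkpoint and the usual [`AutoImageProcessor`]/[`ResNetForImageClassification`] pairing (any other ResNet checkpoint can be substituted in the same way):

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from transformers import AutoImageProcessor, ResNetForImageClassification

>>> # a commonly used COCO test image; any RGB image works
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
>>> model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

>>> # resize and normalize the image, then run it through the model
>>> inputs = image_processor(image, return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_label = logits.argmax(-1).item()
>>> print(model.config.id2label[predicted_label])
```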
@@ -52,30 +48,35 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] ResNetConfig + + ## ResNetModel [[autodoc]] ResNetModel - forward - ## ResNetForImageClassification [[autodoc]] ResNetForImageClassification - forward + + ## TFResNetModel [[autodoc]] TFResNetModel - call - ## TFResNetForImageClassification [[autodoc]] TFResNetForImageClassification - call + + + ## FlaxResNetModel [[autodoc]] FlaxResNetModel @@ -85,3 +86,6 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] FlaxResNetForImageClassification - __call__ + + + diff --git a/docs/source/en/model_doc/roberta-prelayernorm.md b/docs/source/en/model_doc/roberta-prelayernorm.md index 9822fd7af96..000c0a7d2d8 100644 --- a/docs/source/en/model_doc/roberta-prelayernorm.md +++ b/docs/source/en/model_doc/roberta-prelayernorm.md @@ -25,15 +25,15 @@ The abstract from the paper is the following: *fairseq is an open-source sequence modeling toolkit that allows researchers and developers to train custom models for translation, summarization, language modeling, and other text generation tasks. The toolkit is based on PyTorch and supports distributed training across multiple GPUs and machines. We also support fast mixed-precision training and inference on modern GPUs.* -Tips: +This model was contributed by [andreasmaden](https://huggingface.co/andreasmaden). +The original code can be found [here](https://github.com/princeton-nlp/DinkyTrain). + +## Usage tips - The implementation is the same as [Roberta](roberta) except instead of using _Add and Norm_ it does _Norm and Add_. _Add_ and _Norm_ refers to the Addition and LayerNormalization as described in [Attention Is All You Need](https://arxiv.org/abs/1706.03762). - This is identical to using the `--encoder-normalize-before` flag in [fairseq](https://fairseq.readthedocs.io/). -This model was contributed by [andreasmaden](https://huggingface.co/andreasmaden). -The original code can be found [here](https://github.com/princeton-nlp/DinkyTrain). - -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -46,6 +46,9 @@ The original code can be found [here](https://github.com/princeton-nlp/DinkyTrai [[autodoc]] RobertaPreLayerNormConfig + + + ## RobertaPreLayerNormModel [[autodoc]] RobertaPreLayerNormModel @@ -81,6 +84,9 @@ The original code can be found [here](https://github.com/princeton-nlp/DinkyTrai [[autodoc]] RobertaPreLayerNormForQuestionAnswering - forward + + + ## TFRobertaPreLayerNormModel [[autodoc]] TFRobertaPreLayerNormModel @@ -116,6 +122,9 @@ The original code can be found [here](https://github.com/princeton-nlp/DinkyTrai [[autodoc]] TFRobertaPreLayerNormForQuestionAnswering - call + + + ## FlaxRobertaPreLayerNormModel [[autodoc]] FlaxRobertaPreLayerNormModel @@ -150,3 +159,6 @@ The original code can be found [here](https://github.com/princeton-nlp/DinkyTrai [[autodoc]] FlaxRobertaPreLayerNormForQuestionAnswering - __call__ + + + diff --git a/docs/source/en/model_doc/roberta.md b/docs/source/en/model_doc/roberta.md index 5a2ba6b5cf6..364b5b37e5f 100644 --- a/docs/source/en/model_doc/roberta.md +++ b/docs/source/en/model_doc/roberta.md @@ -47,7 +47,9 @@ model published after it. Our best model achieves state-of-the-art results on GL highlight the importance of previously overlooked design choices, and raise questions about the source of recently reported improvements. 
We release our models and code.* -Tips: +This model was contributed by [julien-c](https://huggingface.co/julien-c). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta). + +## Usage tips - This implementation is the same as [`BertModel`] with a tiny embeddings tweak as well as a setup for Roberta pretrained models. @@ -63,8 +65,6 @@ Tips: * use BPE with bytes as a subunit and not characters (because of unicode characters) - [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to this page for usage examples. -This model was contributed by [julien-c](https://huggingface.co/julien-c). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RoBERTa. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. @@ -127,6 +127,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] RobertaTokenizerFast - build_inputs_with_special_tokens + + + ## RobertaModel [[autodoc]] RobertaModel @@ -162,6 +165,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] RobertaForQuestionAnswering - forward + + + ## TFRobertaModel [[autodoc]] TFRobertaModel @@ -197,6 +203,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TFRobertaForQuestionAnswering - call + + + ## FlaxRobertaModel [[autodoc]] FlaxRobertaModel @@ -231,3 +240,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] FlaxRobertaForQuestionAnswering - __call__ + + + diff --git a/docs/source/en/model_doc/roc_bert.md b/docs/source/en/model_doc/roc_bert.md index 831c656fb81..30fadd5c2c1 100644 --- a/docs/source/en/model_doc/roc_bert.md +++ b/docs/source/en/model_doc/roc_bert.md @@ -35,7 +35,7 @@ in the toxic content detection task under human-made attacks.* This model was contributed by [weiweishi](https://huggingface.co/weiweishi). -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -49,7 +49,6 @@ This model was contributed by [weiweishi](https://huggingface.co/weiweishi). [[autodoc]] RoCBertConfig - all - ## RoCBertTokenizer [[autodoc]] RoCBertTokenizer @@ -58,31 +57,26 @@ This model was contributed by [weiweishi](https://huggingface.co/weiweishi). - create_token_type_ids_from_sequences - save_vocabulary - ## RoCBertModel [[autodoc]] RoCBertModel - forward - ## RoCBertForPreTraining [[autodoc]] RoCBertForPreTraining - forward - ## RoCBertForCausalLM [[autodoc]] RoCBertForCausalLM - forward - ## RoCBertForMaskedLM [[autodoc]] RoCBertForMaskedLM - forward - ## RoCBertForSequenceClassification [[autodoc]] transformers.RoCBertForSequenceClassification @@ -93,14 +87,12 @@ This model was contributed by [weiweishi](https://huggingface.co/weiweishi). 
[[autodoc]] transformers.RoCBertForMultipleChoice - forward - ## RoCBertForTokenClassification [[autodoc]] transformers.RoCBertForTokenClassification - forward - ## RoCBertForQuestionAnswering [[autodoc]] RoCBertForQuestionAnswering - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/roformer.md b/docs/source/en/model_doc/roformer.md index f15a1062965..5d8f146c43f 100644 --- a/docs/source/en/model_doc/roformer.md +++ b/docs/source/en/model_doc/roformer.md @@ -33,15 +33,13 @@ transformer with rotary position embedding, or RoFormer, achieves superior perfo release the theoretical analysis along with some preliminary experiment results on Chinese data. The undergoing experiment for English benchmark will soon be updated.* -Tips: - -- RoFormer is a BERT-like autoencoding model with rotary position embeddings. Rotary position embeddings have shown - improved performance on classification tasks with long texts. - - This model was contributed by [junnyu](https://huggingface.co/junnyu). The original code can be found [here](https://github.com/ZhuiyiTechnology/roformer). -## Documentation resources +## Usage tips +RoFormer is a BERT-like autoencoding model with rotary position embeddings. Rotary position embeddings have shown +improved performance on classification tasks with long texts. + +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -67,6 +65,9 @@ This model was contributed by [junnyu](https://huggingface.co/junnyu). The origi [[autodoc]] RoFormerTokenizerFast - build_inputs_with_special_tokens + + + ## RoFormerModel [[autodoc]] RoFormerModel @@ -102,6 +103,9 @@ This model was contributed by [junnyu](https://huggingface.co/junnyu). The origi [[autodoc]] RoFormerForQuestionAnswering - forward + + + ## TFRoFormerModel [[autodoc]] TFRoFormerModel @@ -137,6 +141,9 @@ This model was contributed by [junnyu](https://huggingface.co/junnyu). The origi [[autodoc]] TFRoFormerForQuestionAnswering - call + + + ## FlaxRoFormerModel [[autodoc]] FlaxRoFormerModel @@ -166,3 +173,6 @@ This model was contributed by [junnyu](https://huggingface.co/junnyu). The origi [[autodoc]] FlaxRoFormerForQuestionAnswering - __call__ + + + diff --git a/docs/source/en/model_doc/rwkv.md b/docs/source/en/model_doc/rwkv.md index 9293db14cc6..3dfcf7ba4b5 100644 --- a/docs/source/en/model_doc/rwkv.md +++ b/docs/source/en/model_doc/rwkv.md @@ -27,7 +27,7 @@ This can be more efficient than a regular Transformer and can deal with sentence This model was contributed by [sgugger](https://huggingface.co/sgugger). The original code can be found [here](https://github.com/BlinkDL/RWKV-LM). -Example of use as an RNN: +## Usage example ```py import torch @@ -73,7 +73,6 @@ output = model.generate(inputs["input_ids"], max_new_tokens=64, stopping_criteri [[autodoc]] RwkvConfig - ## RwkvModel [[autodoc]] RwkvModel diff --git a/docs/source/en/model_doc/segformer.md b/docs/source/en/model_doc/segformer.md index 0f535351af5..4edd646cd4f 100644 --- a/docs/source/en/model_doc/segformer.md +++ b/docs/source/en/model_doc/segformer.md @@ -43,7 +43,7 @@ The figure below illustrates the architecture of SegFormer. Taken from the [orig This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/NVlabs/SegFormer). 
-Tips: +## Usage tips - SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decoder head. [`SegformerModel`] is the hierarchical Transformer encoder (which in the paper is also referred to @@ -123,6 +123,9 @@ If you're interested in submitting a resource to be included here, please feel f - preprocess - post_process_semantic_segmentation + + + ## SegformerModel [[autodoc]] SegformerModel @@ -143,6 +146,9 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] SegformerForSemanticSegmentation - forward + + + ## TFSegformerDecodeHead [[autodoc]] TFSegformerDecodeHead @@ -162,3 +168,6 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] TFSegformerForSemanticSegmentation - call + + + \ No newline at end of file diff --git a/docs/source/en/model_doc/sew-d.md b/docs/source/en/model_doc/sew-d.md index b70c59061b5..013e404bd04 100644 --- a/docs/source/en/model_doc/sew-d.md +++ b/docs/source/en/model_doc/sew-d.md @@ -32,15 +32,15 @@ variety of training setups. For example, under the 100h-960h semi-supervised set inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference time, SEW reduces word error rate by 25-50% across different model sizes.* -Tips: +This model was contributed by [anton-l](https://huggingface.co/anton-l). + +## Usage tips - SEW-D is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. - SEWDForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using [`Wav2Vec2CTCTokenizer`]. -This model was contributed by [anton-l](https://huggingface.co/anton-l). - -## Documentation resources +## Resources - [Audio classification task guide](../tasks/audio_classification) - [Automatic speech recognition task guide](../tasks/asr) diff --git a/docs/source/en/model_doc/sew.md b/docs/source/en/model_doc/sew.md index ebf128ea429..ee8a36a4dcb 100644 --- a/docs/source/en/model_doc/sew.md +++ b/docs/source/en/model_doc/sew.md @@ -32,15 +32,15 @@ variety of training setups. For example, under the 100h-960h semi-supervised set inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference time, SEW reduces word error rate by 25-50% across different model sizes.* -Tips: +This model was contributed by [anton-l](https://huggingface.co/anton-l). + +## Usage tips - SEW is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. - SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using [`Wav2Vec2CTCTokenizer`]. -This model was contributed by [anton-l](https://huggingface.co/anton-l). - -## Documentation resources +## Resources - [Audio classification task guide](../tasks/audio_classification) - [Automatic speech recognition task guide](../tasks/asr) diff --git a/docs/source/en/model_doc/speech_to_text.md b/docs/source/en/model_doc/speech_to_text.md index cb13a1871ae..23512b323af 100644 --- a/docs/source/en/model_doc/speech_to_text.md +++ b/docs/source/en/model_doc/speech_to_text.md @@ -27,7 +27,6 @@ transcripts/translations autoregressively. Speech2Text has been fine-tuned on se This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text). 
- ## Inference Speech2Text is a speech model that accepts a float tensor of log-mel filter-bank features extracted from the speech @@ -44,7 +43,6 @@ install those packages before running the examples. You could either install tho `pip install transformers"[speech, sentencepiece]"` or install the packages separately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can be installed as follows: `apt install libsndfile1-dev` - - ASR and Speech Translation ```python @@ -98,7 +96,6 @@ be installed as follows: `apt install libsndfile1-dev` See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints. - ## Speech2TextConfig [[autodoc]] Speech2TextConfig @@ -125,6 +122,9 @@ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look - batch_decode - decode + + + ## Speech2TextModel [[autodoc]] Speech2TextModel @@ -135,6 +135,9 @@ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look [[autodoc]] Speech2TextForConditionalGeneration - forward + + + ## TFSpeech2TextModel [[autodoc]] TFSpeech2TextModel @@ -144,3 +147,6 @@ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look [[autodoc]] TFSpeech2TextForConditionalGeneration - call + + + diff --git a/docs/source/en/model_doc/speech_to_text_2.md b/docs/source/en/model_doc/speech_to_text_2.md index 1abdeced580..6648e67f629 100644 --- a/docs/source/en/model_doc/speech_to_text_2.md +++ b/docs/source/en/model_doc/speech_to_text_2.md @@ -31,8 +31,7 @@ This model was contributed by [Patrick von Platen](https://huggingface.co/patric The original code can be found [here](https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266). - -Tips: +## Usage tips - Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see the [official models](https://huggingface.co/models?other=speech2text2) . @@ -98,7 +97,7 @@ predicted token ids. See [model hub](https://huggingface.co/models?filter=speech2text2) to look for Speech2Text2 checkpoints. -## Documentation resources +## Resources - [Causal language modeling task guide](../tasks/language_modeling) diff --git a/docs/source/en/model_doc/splinter.md b/docs/source/en/model_doc/splinter.md index f16169d9b21..a46c55966c0 100644 --- a/docs/source/en/model_doc/splinter.md +++ b/docs/source/en/model_doc/splinter.md @@ -34,7 +34,9 @@ are replaced with a special token, viewed as a question representation, that is the answer span. The resulting model obtains surprisingly good results on multiple benchmarks (e.g., 72.7 F1 on SQuAD with only 128 training examples), while maintaining competitive performance in the high-resource setting. -Tips: +This model was contributed by [yuvalkirstain](https://huggingface.co/yuvalkirstain) and [oriram](https://huggingface.co/oriram). The original code can be found [here](https://github.com/oriram/splinter). + +## Usage tips - Splinter was trained to predict answers spans conditioned on a special [QUESTION] token. These tokens contextualize to question representations which are used to predict the answers. This layer is called QASS, and is the default @@ -49,9 +51,7 @@ Tips: doesn't (*tau/splinter-base* and *tau/splinter-large*). 
This is done to support randomly initializing this layer at fine-tuning, as it is shown to yield better results for some cases in the paper. -This model was contributed by [yuvalkirstain](https://huggingface.co/yuvalkirstain) and [oriram](https://huggingface.co/oriram). The original code can be found [here](https://github.com/oriram/splinter). - -## Documentation resources +## Resources - [Question answering task guide](../tasks/question-answering) diff --git a/docs/source/en/model_doc/squeezebert.md b/docs/source/en/model_doc/squeezebert.md index 515a2ef3178..e2bb378fe5b 100644 --- a/docs/source/en/model_doc/squeezebert.md +++ b/docs/source/en/model_doc/squeezebert.md @@ -38,7 +38,9 @@ self-attention layers with grouped convolutions, and we use this technique in a SqueezeBERT, which runs 4.3x faster than BERT-base on the Pixel 3 while achieving competitive accuracy on the GLUE test set. The SqueezeBERT code will be released.* -Tips: +This model was contributed by [forresti](https://huggingface.co/forresti). + +## Usage tips - SqueezeBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. @@ -48,9 +50,7 @@ Tips: - For best results when finetuning on sequence classification tasks, it is recommended to start with the *squeezebert/squeezebert-mnli-headless* checkpoint. -This model was contributed by [forresti](https://huggingface.co/forresti). - -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) diff --git a/docs/source/en/model_doc/swiftformer.md b/docs/source/en/model_doc/swiftformer.md index 67c9597d212..30c6941f0f4 100644 --- a/docs/source/en/model_doc/swiftformer.md +++ b/docs/source/en/model_doc/swiftformer.md @@ -26,14 +26,9 @@ The abstract from the paper is the following: *Self-attention has become a defacto choice for capturing global context in various vision applications. However, its quadratic computational complexity with respect to image resolution limits its use in real-time applications, especially for deployment on resource-constrained mobile devices. Although hybrid approaches have been proposed to combine the advantages of convolutions and self-attention for a better speed-accuracy trade-off, the expensive matrix multiplication operations in self-attention remain a bottleneck. In this work, we introduce a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations with linear element-wise multiplications. Our design shows that the key-value interaction can be replaced with a linear layer without sacrificing any accuracy. Unlike previous state-of-the-art methods, our efficient formulation of self-attention enables its usage at all stages of the network. Using our proposed efficient additive attention, we build a series of models called "SwiftFormer" which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed. Our small variant achieves 78.5% top-1 ImageNet-1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2x faster compared to MobileViT-v2.* -Tips: - - One can use the [`ViTImageProcessor`] API to prepare images for the model. - - This model was contributed by [shehan97](https://huggingface.co/shehan97). The original code can be found [here](https://github.com/Amshaker/SwiftFormer). 
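Image inputs are prepared with a standard ViT-style image processor. The snippet below is a minimal classification sketch under that assumption, using the `MBZUAI/swiftformer-xs` checkpoint name as an example (other SwiftFormer checkpoints can be substituted):

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from transformers import ViTImageProcessor, SwiftFormerForImageClassification

>>> image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)

>>> # assumed checkpoint id; SwiftFormer reuses the ViT image preprocessing pipeline
>>> processor = ViTImageProcessor.from_pretrained("MBZUAI/swiftformer-xs")
>>> model = SwiftFormerForImageClassification.from_pretrained("MBZUAI/swiftformer-xs")

>>> inputs = processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> print(model.config.id2label[logits.argmax(-1).item()])
```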
- ## SwiftFormerConfig [[autodoc]] SwiftFormerConfig diff --git a/docs/source/en/model_doc/swin.md b/docs/source/en/model_doc/swin.md index 37bb86db951..e23c882a3f0 100644 --- a/docs/source/en/model_doc/swin.md +++ b/docs/source/en/model_doc/swin.md @@ -36,11 +36,6 @@ prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO +2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures.* -Tips: -- One can use the [`AutoImageProcessor`] API to prepare images for the model. -- Swin pads the inputs supporting any input height and width (if divisible by `32`). -- Swin can be used as a *backbone*. When `output_hidden_states = True`, it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, sequence_length, num_channels)`. - drawing @@ -48,6 +43,10 @@ alt="drawing" width="600"/> This model was contributed by [novice03](https://huggingface.co/novice03). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/microsoft/Swin-Transformer). +## Usage tips + +- Swin pads the inputs supporting any input height and width (if divisible by `32`). +- Swin can be used as a *backbone*. When `output_hidden_states = True`, it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, sequence_length, num_channels)`. ## Resources @@ -68,6 +67,8 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] SwinConfig + + ## SwinModel @@ -84,6 +85,9 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] transformers.SwinForImageClassification - forward + + + ## TFSwinModel [[autodoc]] TFSwinModel @@ -98,3 +102,6 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] transformers.TFSwinForImageClassification - call + + + \ No newline at end of file diff --git a/docs/source/en/model_doc/swinv2.md b/docs/source/en/model_doc/swinv2.md index e08389527ec..25233dca339 100644 --- a/docs/source/en/model_doc/swinv2.md +++ b/docs/source/en/model_doc/swinv2.md @@ -24,9 +24,6 @@ The abstract from the paper is the following: *Large-scale NLP models have been shown to significantly improve the performance on language tasks with no signs of saturation. They also demonstrate amazing few-shot capabilities like that of human beings. This paper aims to explore large-scale models in computer vision. We tackle three major issues in training and application of large vision models, including training instability, resolution gaps between pre-training and fine-tuning, and hunger on labelled data. Three main techniques are proposed: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) A log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) A self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images. 
Through these techniques, this paper successfully trained a 3 billion-parameter Swin Transformer V2 model, which is the largest dense vision model to date, and makes it capable of training with images of up to 1,536Γ—1,536 resolution. It set new performance records on 4 representative vision tasks, including ImageNet-V2 image classification, COCO object detection, ADE20K semantic segmentation, and Kinetics-400 video action classification. Also note our training is much more efficient than that in Google's billion-level visual models, which consumes 40 times less labelled data and 40 times less training time.* -Tips: -- One can use the [`AutoImageProcessor`] API to prepare images for the model. - This model was contributed by [nandwalritik](https://huggingface.co/nandwalritik). The original code can be found [here](https://github.com/microsoft/Swin-Transformer). diff --git a/docs/source/en/model_doc/switch_transformers.md b/docs/source/en/model_doc/switch_transformers.md index 8f6a231b7ef..5080f711ace 100644 --- a/docs/source/en/model_doc/switch_transformers.md +++ b/docs/source/en/model_doc/switch_transformers.md @@ -23,19 +23,18 @@ The SwitchTransformers model was proposed in [Switch Transformers: Scaling to Tr The Switch Transformer model uses a sparse T5 encoder-decoder architecture, where the MLP are replaced by a Mixture of Experts (MoE). A routing mechanism (top 1 in this case) associates each token to one of the expert, where each expert is a dense MLP. While switch transformers have a lot more weights than their equivalent dense models, the sparsity allows better scaling and better finetuning performance at scale. During a forward pass, only a fraction of the weights are used. The routing mechanism allows the model to select relevant weights on the fly which increases the model capacity without increasing the number of operations. - The abstract from the paper is the following: *In deep learning, models typically reuse the same parameters for all inputs. Mixture of Experts (MoE) defies this and instead selects different parameters for each incoming example. The result is a sparsely-activated model -- with outrageous numbers of parameters -- but a constant computational cost. However, despite several notable successes of MoE, widespread adoption has been hindered by complexity, communication costs and training instability -- we address these with the Switch Transformer. We simplify the MoE routing algorithm and design intuitive improved models with reduced communication and computational costs. Our proposed training techniques help wrangle the instabilities and we show large sparse models may be trained, for the first time, with lower precision (bfloat16) formats. We design models based off T5-Base and T5-Large to obtain up to 7x increases in pre-training speed with the same computational resources. These improvements extend into multilingual settings where we measure gains over the mT5-Base version across all 101 languages. Finally, we advance the current scale of language models by pre-training up to trillion parameter models on the "Colossal Clean Crawled Corpus" and achieve a 4x speedup over the T5-XXL model.* -Tips: +This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArtZucker) . +The original code can be found [here](https://github.com/google/flaxformer/tree/main/flaxformer/architectures/moe). 
+ +## Usage tips - SwitchTransformers uses the [`T5Tokenizer`], which can be loaded directly from each model's repository. - The released weights are pretrained on English [Masked Language Modeling](https://moon-ci-docs.huggingface.co/docs/transformers/pr_19323/en/glossary#general-terms) task, and should be finetuned. -This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArtZucker) . -The original code can be found [here](https://github.com/google/flaxformer/tree/main/flaxformer/architectures/moe). - ## Resources - [Translation task guide](../tasks/translation) diff --git a/docs/source/en/model_doc/t5.md b/docs/source/en/model_doc/t5.md index 2e833e8e1a6..704d05987b9 100644 --- a/docs/source/en/model_doc/t5.md +++ b/docs/source/en/model_doc/t5.md @@ -45,7 +45,11 @@ with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the- summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our dataset, pre-trained models, and code.* -Tips: +All checkpoints can be found on the [hub](https://huggingface.co/models?search=t5). + +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/google-research/text-to-text-transfer-transformer). + +## Usage tips - T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks and for which each task is converted into a text-to-text format. T5 works well on a variety of tasks out-of-the-box by prepending a @@ -91,12 +95,6 @@ Based on the original T5 model, Google has released some follow-up works: - **UMT5**: UmT5 is a multilingual T5 model trained on an improved and refreshed mC4 multilingual corpus, 29 trillion characters across 107 language, using a new sampling method, UniMax. Refer to the documentation of mT5 which can be found [here](umt5). -All checkpoints can be found on the [hub](https://huggingface.co/models?search=t5). - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/google-research/text-to-text-transfer-transformer). - - - ## Training T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher @@ -249,8 +247,6 @@ batches to the longest example is not recommended on TPU as it triggers a recomp encountered during training thus significantly slowing down the training. only padding up to the longest example in a batch) leads to very slow training on TPU. - - ## Inference At inference time, it is recommended to use [`~generation.GenerationMixin.generate`]. This @@ -316,9 +312,6 @@ The predicted tokens will then be placed between the sentinel tokens. [' park offers the park.'] ``` - - - ## Performance If you'd like a faster training and inference performance, install [apex](https://github.com/NVIDIA/apex#quick-start) and then the model will automatically use `apex.normalization.FusedRMSNorm` instead of `T5LayerNorm`. The former uses an optimized fused kernel which is several times faster than the latter. 
@@ -386,6 +379,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] T5TokenizerFast + + + ## T5Model [[autodoc]] T5Model @@ -411,6 +407,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] T5ForQuestionAnswering - forward + + + ## TFT5Model [[autodoc]] TFT5Model @@ -426,6 +425,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TFT5EncoderModel - call + + + ## FlaxT5Model [[autodoc]] FlaxT5Model @@ -444,3 +446,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] FlaxT5EncoderModel - __call__ + + + diff --git a/docs/source/en/model_doc/t5v1.1.md b/docs/source/en/model_doc/t5v1.1.md index 900e26f521d..e18696f629d 100644 --- a/docs/source/en/model_doc/t5v1.1.md +++ b/docs/source/en/model_doc/t5v1.1.md @@ -20,6 +20,10 @@ rendered properly in your Markdown viewer. T5v1.1 was released in the [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) repository by Colin Raffel et al. It's an improved version of the original T5 model. +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be +found [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511). + +## Usage tips One can directly plug in the weights of T5v1.1 into a T5 model, like so: @@ -59,7 +63,9 @@ Google has released the following variants: - [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl). -One can refer to [T5's documentation page](t5) for all tips, code examples and notebooks. -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be -found [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511). + + +Refer to [T5's documentation page](t5) for all API reference, tips, code examples and notebooks. + + \ No newline at end of file diff --git a/docs/source/en/model_doc/table-transformer.md b/docs/source/en/model_doc/table-transformer.md index 7ea7ae8cd35..850e7f50aa6 100644 --- a/docs/source/en/model_doc/table-transformer.md +++ b/docs/source/en/model_doc/table-transformer.md @@ -33,16 +33,15 @@ significant increase in training performance and a more reliable estimate of mod object detection models trained on PubTables-1M produce excellent results for all three tasks of detection, structure recognition, and functional analysis without the need for any special customization for these tasks.* -Tips: - -- The authors released 2 models, one for [table detection](https://huggingface.co/microsoft/table-transformer-detection) in documents, one for [table structure recognition](https://huggingface.co/microsoft/table-transformer-structure-recognition) (the task of recognizing the individual rows, columns etc. in a table). -- One can use the [`AutoImageProcessor`] API to prepare images and optional targets for the model. This will load a [`DetrImageProcessor`] behind the scenes. - drawing Table detection and table structure recognition clarified. Taken from the original paper. 
+The authors released 2 models, one for [table detection](https://huggingface.co/microsoft/table-transformer-detection) in +documents, one for [table structure recognition](https://huggingface.co/microsoft/table-transformer-structure-recognition) +(the task of recognizing the individual rows, columns etc. in a table). + This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/table-transformer). diff --git a/docs/source/en/model_doc/tapas.md b/docs/source/en/model_doc/tapas.md index 1c76015f285..78d2f3ee138 100644 --- a/docs/source/en/model_doc/tapas.md +++ b/docs/source/en/model_doc/tapas.md @@ -44,7 +44,7 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The Tensorflow version of this model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/google-research/tapas). -Tips: +## Usage tips - TAPAS is a model that uses relative position embeddings by default (restarting the position embeddings at every cell of the table). Note that this is something that was added after the publication of the original TAPAS paper. According to the authors, this usually results in a slightly better performance, and allows you to encode longer sequences without running out of embeddings. This is reflected in the `reset_position_index_per_cell` parameter of [`TapasConfig`], which is set to `True` by default. The default versions of the models available on the [hub](https://huggingface.co/models?search=tapas) all use relative position embeddings. You can still use the ones with absolute position embeddings by passing in an additional argument `revision="no_reset"` when calling the `from_pretrained()` method. Note that it's usually advised to pad the inputs on the right rather than the left. - TAPAS is based on BERT, so `TAPAS-base` for example corresponds to a `BERT-base` architecture. Of course, `TAPAS-large` will result in the best performance (the results reported in the paper are from `TAPAS-large`). Results of the various sized models are shown on the [original Github repository](https://github.com/google-research/tapas>). @@ -573,7 +573,7 @@ Predicted answer: SUM > 87, 53, 69 In case of a conversational set-up, then each table-question pair must be provided **sequentially** to the model, such that the `prev_labels` token types can be overwritten by the predicted `labels` of the previous table-question pair. Again, more info can be found in [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) (for PyTorch) and [this notebook](https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) (for TensorFlow). 
-## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Masked language modeling task guide](../tasks/masked_language_modeling) @@ -590,6 +590,9 @@ In case of a conversational set-up, then each table-question pair must be provid - convert_logits_to_predictions - save_vocabulary + + + ## TapasModel [[autodoc]] TapasModel - forward @@ -606,6 +609,9 @@ In case of a conversational set-up, then each table-question pair must be provid [[autodoc]] TapasForQuestionAnswering - forward + + + ## TFTapasModel [[autodoc]] TFTapasModel - call @@ -620,4 +626,9 @@ In case of a conversational set-up, then each table-question pair must be provid ## TFTapasForQuestionAnswering [[autodoc]] TFTapasForQuestionAnswering - - call \ No newline at end of file + - call + + + + + diff --git a/docs/source/en/model_doc/tapex.md b/docs/source/en/model_doc/tapex.md index 52234b5c59b..15ac2463fd8 100644 --- a/docs/source/en/model_doc/tapex.md +++ b/docs/source/en/model_doc/tapex.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. -This model is in maintenance mode only, so we won't accept any new PRs changing its code. +This model is in maintenance mode only, we don't accept any new PRs changing its code. If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. You can do so by running the following command: `pip install -U transformers==4.30.0`. @@ -49,7 +49,7 @@ on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiT to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs and to achieve new state-of-the-art results on various downstream tasks.* -Tips: +## Usage tips - TAPEX is a generative (seq2seq) model. One can directly plug in the weights of TAPEX into a BART model. - TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact. @@ -58,7 +58,7 @@ Tips: - TAPEX has its own tokenizer, that allows to prepare all data for the model easily. One can pass Pandas DataFrames and strings to the tokenizer, and it will automatically create the `input_ids` and `attention_mask` (as shown in the usage examples below). -## Usage: inference +### Usage: inference Below, we illustrate how to use TAPEX for table question answering. As one can see, one can directly plug in the weights of TAPEX into a BART model. We use the [Auto API](auto), which will automatically instantiate the appropriate tokenizer ([`TapexTokenizer`]) and model ([`BartForConditionalGeneration`]) for us, @@ -135,6 +135,12 @@ benchmark for table fact checking (it achieves 84% accuracy). The code example b Refused ``` + + +TAPEX architecture is the same as BART, except for tokenization. Refer to [BART documentation](bart) for information on +configuration classes and their parameters. TAPEX-specific tokenizer is documented below. + + ## TapexTokenizer diff --git a/docs/source/en/model_doc/time_series_transformer.md b/docs/source/en/model_doc/time_series_transformer.md index 208798aa1c6..c5bfcfc15ea 100644 --- a/docs/source/en/model_doc/time_series_transformer.md +++ b/docs/source/en/model_doc/time_series_transformer.md @@ -16,18 +16,12 @@ rendered properly in your Markdown viewer. # Time Series Transformer - - -This is a recently introduced model so the API hasn't been tested extensively. 
There may be some bugs or slight -breaking changes to fix it in the future. If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). - - - ## Overview The Time Series Transformer model is a vanilla encoder-decoder Transformer for time series forecasting. +This model was contributed by [kashif](https://huggingface.co/kashif). -Tips: +## Usage tips - Similar to other models in the library, [`TimeSeriesTransformerModel`] is the raw Transformer without any head on top, and [`TimeSeriesTransformerForPrediction`] adds a distribution head on top of the former, which can be used for time-series forecasting. Note that this is a so-called probabilistic forecasting model, not a @@ -56,9 +50,6 @@ of the context as initial input for the decoder). - At inference time, we give the final value of the `past_values` as input to the decoder. Next, we can sample from the model to make a prediction at the next time step, which is then fed to the decoder in order to make the next prediction (also called autoregressive generation). - -This model was contributed by [kashif](https://huggingface.co/kashif). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. @@ -70,13 +61,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TimeSeriesTransformerConfig - ## TimeSeriesTransformerModel [[autodoc]] TimeSeriesTransformerModel - forward - ## TimeSeriesTransformerForPrediction [[autodoc]] TimeSeriesTransformerForPrediction diff --git a/docs/source/en/model_doc/timesformer.md b/docs/source/en/model_doc/timesformer.md index d87fde4fb2b..fe75bee5b28 100644 --- a/docs/source/en/model_doc/timesformer.md +++ b/docs/source/en/model_doc/timesformer.md @@ -25,14 +25,15 @@ The abstract from the paper is the following: *We present a convolution-free approach to video classification built exclusively on self-attention over space and time. Our method, named "TimeSformer," adapts the standard Transformer architecture to video by enabling spatiotemporal feature learning directly from a sequence of frame-level patches. Our experimental study compares different self-attention schemes and suggests that "divided attention," where temporal attention and spatial attention are separately applied within each block, leads to the best video classification accuracy among the design choices considered. Despite the radically new design, TimeSformer achieves state-of-the-art results on several action recognition benchmarks, including the best reported accuracy on Kinetics-400 and Kinetics-600. Finally, compared to 3D convolutional networks, our model is faster to train, it can achieve dramatically higher test efficiency (at a small drop in accuracy), and it can also be applied to much longer video clips (over one minute long). Code and models are available at: [this https URL](https://github.com/facebookresearch/TimeSformer).* -Tips: - -There are many pretrained variants. Select your pretrained model based on the dataset it is trained on. Moreover, the number of input frames per clip changes based on the model size so you should consider this parameter while selecting your pretrained model. 
- This model was contributed by [fcakyon](https://huggingface.co/fcakyon). The original code can be found [here](https://github.com/facebookresearch/TimeSformer). -## Documentation resources +## Usage tips + +There are many pretrained variants. Select your pretrained model based on the dataset it is trained on. Moreover, +the number of input frames per clip changes based on the model size so you should consider this parameter while selecting your pretrained model. + +## Resources - [Video classification task guide](../tasks/video_classification) diff --git a/docs/source/en/model_doc/trajectory_transformer.md b/docs/source/en/model_doc/trajectory_transformer.md index 548642f7bb9..45616255871 100644 --- a/docs/source/en/model_doc/trajectory_transformer.md +++ b/docs/source/en/model_doc/trajectory_transformer.md @@ -43,19 +43,18 @@ in offline RL algorithms. We demonstrate the flexibility of this approach across imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.* -Tips: +This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer). + +## Usage tips This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from actions, states and rewards from all previous timesteps. This model will treat all these elements together as one big sequence (a trajectory). -This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer). - ## TrajectoryTransformerConfig [[autodoc]] TrajectoryTransformerConfig - ## TrajectoryTransformerModel [[autodoc]] TrajectoryTransformerModel diff --git a/docs/source/en/model_doc/transfo-xl.md b/docs/source/en/model_doc/transfo-xl.md index beb5ba2fea8..d75e3a37b99 100644 --- a/docs/source/en/model_doc/transfo-xl.md +++ b/docs/source/en/model_doc/transfo-xl.md @@ -45,7 +45,9 @@ bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably coherent, novel text articles with thousands of tokens.* -Tips: +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/kimiyoung/transformer-xl). + +## Usage tips - Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left. @@ -54,7 +56,6 @@ Tips: - Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention scores. This allows the model to pay attention to information that was in the previous segment as well as the current one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments. - This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would give the same results in the current input and the current hidden state at a given position) and needs to make some adjustments in the way attention scores are computed. -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). 
The original code can be found [here](https://github.com/kimiyoung/transformer-xl). @@ -62,7 +63,7 @@ TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyT -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Causal language modeling task guide](../tasks/language_modeling) @@ -86,6 +87,9 @@ TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyT [[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput + + + ## TransfoXLModel [[autodoc]] TransfoXLModel @@ -101,6 +105,9 @@ TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyT [[autodoc]] TransfoXLForSequenceClassification - forward + + + ## TFTransfoXLModel [[autodoc]] TFTransfoXLModel @@ -116,6 +123,9 @@ TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyT [[autodoc]] TFTransfoXLForSequenceClassification - call + + + ## Internal Layers [[autodoc]] AdaptiveEmbedding diff --git a/docs/source/en/model_doc/trocr.md b/docs/source/en/model_doc/trocr.md index bfab93ad663..c471a13bbd2 100644 --- a/docs/source/en/model_doc/trocr.md +++ b/docs/source/en/model_doc/trocr.md @@ -43,7 +43,7 @@ Please refer to the [`VisionEncoderDecoder`] class on how to use this model. This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm/tree/6f60612e7cc86a2a1ae85c47231507a587ab4e01/trocr). -Tips: +## Usage tips - The quickest way to get started with TrOCR is by checking the [tutorial notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/TrOCR), which show how to use the model diff --git a/docs/source/en/model_doc/tvlt.md b/docs/source/en/model_doc/tvlt.md index 5ddb6badb71..f09ea8af863 100644 --- a/docs/source/en/model_doc/tvlt.md +++ b/docs/source/en/model_doc/tvlt.md @@ -25,14 +25,6 @@ The abstract from the paper is the following: *In this work, we present the Textless Vision-Language Transformer (TVLT), where homogeneous transformer blocks take raw visual and audio inputs for vision-and-language representation learning with minimal modality-specific design, and do not use text-specific modules such as tokenization or automatic speech recognition (ASR). TVLT is trained by reconstructing masked patches of continuous video frames and audio spectrograms (masked autoencoding) and contrastive modeling to align video and audio. TVLT attains performance comparable to its text-based counterpart on various multimodal tasks, such as visual question answering, image retrieval, video retrieval, and multimodal sentiment analysis, with 28x faster inference speed and only 1/3 of the parameters. Our findings suggest the possibility of learning compact and efficient visual-linguistic representations from low-level visual and audio signals without assuming the prior existence of text.* -Tips: - -- TVLT is a model that takes both `pixel_values` and `audio_values` as input. One can use [`TvltProcessor`] to prepare data for the model. - This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one. -- TVLT is trained with images/videos and audios of various sizes: the authors resize and crop the input images/videos to 224 and limit the length of audio spectrogram to 2048. 
To make batching of videos and audios possible, the authors use a `pixel_mask` that indicates which pixels are real/padding and `audio_mask` that indicates which audio values are real/padding. -- The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE) as in [ViTMAE](vitmae). The difference is that the model includes embedding layers for the audio modality. -- The PyTorch version of this model is only available in torch 1.10 and higher. -
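As a quick illustration of how [`TvltProcessor`] wraps the image processor and audio feature extractor, here is a minimal sketch. The checkpoint name `ZinengTang/tvlt-base`, the dummy input shapes, and the 44.1 kHz sampling rate are assumptions for illustration only; real video frames and audio waveforms would be passed in the same way.

```python
import numpy as np
import torch
from transformers import TvltProcessor, TvltModel

processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

# Dummy inputs: 8 video frames of shape (channels, height, width) and a mono waveform
video_frames = list(np.random.rand(8, 3, 224, 224))
audio = list(np.random.rand(10000))

# The processor produces pixel_values/audio_values plus the corresponding masks
inputs = processor(video_frames, audio, sampling_rate=44100, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

last_hidden_state = outputs.last_hidden_state
```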

drawing @@ -42,6 +34,14 @@ alt="drawing" width="600"/> The original code can be found [here](https://github.com/zinengtang/TVLT). This model was contributed by [Zineng Tang](https://huggingface.co/ZinengTang). +## Usage tips + +- TVLT is a model that takes both `pixel_values` and `audio_values` as input. One can use [`TvltProcessor`] to prepare data for the model. + This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one. +- TVLT is trained with images/videos and audios of various sizes: the authors resize and crop the input images/videos to 224 and limit the length of audio spectrogram to 2048. To make batching of videos and audios possible, the authors use a `pixel_mask` that indicates which pixels are real/padding and `audio_mask` that indicates which audio values are real/padding. +- The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE) as in [ViTMAE](vitmae). The difference is that the model includes embedding layers for the audio modality. +- The PyTorch version of this model is only available in torch 1.10 and higher. + ## TvltConfig [[autodoc]] TvltConfig diff --git a/docs/source/en/model_doc/ul2.md b/docs/source/en/model_doc/ul2.md index 3863f23a7d7..f4d01c40b0c 100644 --- a/docs/source/en/model_doc/ul2.md +++ b/docs/source/en/model_doc/ul2.md @@ -24,12 +24,20 @@ The abstract from the paper is the following: *Existing pre-trained models are generally geared towards a particular class of problems. To date, there seems to be still no consensus on what the right architecture and pre-training setup should be. This paper presents a unified framework for pre-training models that are universally effective across datasets and setups. We begin by disentangling architectural archetypes with pre-training objectives -- two concepts that are commonly conflated. Next, we present a generalized and unified perspective for self-supervision in NLP and show how different pre-training objectives can be cast as one another and how interpolating between different objectives can be effective. We then propose Mixture-of-Denoisers (MoD), a pre-training objective that combines diverse pre-training paradigms together. We furthermore introduce a notion of mode switching, wherein downstream fine-tuning is associated with specific pre-training schemes. We conduct extensive ablative experiments to compare multiple pre-training objectives and find that our method pushes the Pareto-frontier by outperforming T5 and/or GPT-like models across multiple diverse setups. Finally, by scaling our model up to 20B parameters, we achieve SOTA performance on 50 well-established supervised NLP tasks ranging from language generation (with automated and human evaluation), language understanding, text classification, question answering, commonsense reasoning, long text reasoning, structured knowledge grounding and information retrieval. Our model also achieve strong results at in-context learning, outperforming 175B GPT-3 on zero-shot SuperGLUE and tripling the performance of T5-XXL on one-shot summarization.* -Tips: +This model was contributed by [DanielHesslow](https://huggingface.co/Seledorn). The original code can be found [here](https://github.com/google-research/google-research/tree/master/ul2). + +## Usage tips - UL2 is an encoder-decoder model pre-trained on a mixture of denoising functions as well as fine-tuned on an array of downstream tasks. 
- UL2 has the same architecture as [T5v1.1](t5v1.1) but uses the Gated-SiLU activation function instead of Gated-GELU. - The authors release checkpoints of one architecture which can be seen [here](https://huggingface.co/google/ul2) -The original code can be found [here](https://github.com/google-research/google-research/tree/master/ul2). + + +As UL2 has the same architecture as T5v1.1, refer to [T5's documentation page](t5) for API reference, tips, code examples and notebooks. + + + + + -This model was contributed by [DanielHesslow](https://huggingface.co/Seledorn). diff --git a/docs/source/en/model_doc/umt5.md b/docs/source/en/model_doc/umt5.md index 4e6375bd465..6a7498c2433 100644 --- a/docs/source/en/model_doc/umt5.md +++ b/docs/source/en/model_doc/umt5.md @@ -33,13 +33,6 @@ The abstract from the paper is the following: *Pretrained multilingual large language models have typically used heuristic temperature-based sampling to balance between different languages. However previous work has not systematically evaluated the efficacy of different pretraining language distributions across model scales. In this paper, we propose a new sampling method, UniMax, that delivers more uniform coverage of head languages while mitigating overfitting on tail languages by explicitly capping the number of repeats over each language's corpus. We perform an extensive series of ablations testing a range of sampling strategies on a suite of multilingual benchmarks, while varying model scale. We find that UniMax outperforms standard temperature-based sampling, and the benefits persist as scale increases. As part of our contribution, we release: (i) an improved and refreshed mC4 multilingual corpus consisting of 29 trillion characters across 107 languages, and (ii) a suite of pretrained umT5 model checkpoints trained with UniMax sampling.* -Tips: - -- UMT5 was only pre-trained on [mC4](https://huggingface.co/datasets/mc4) excluding any supervised training. -Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model. -- Since umT5 was pre-trained in an unsupervise manner, there's no real advantage to using a task prefix during single-task -fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix. - Google has released the following variants: - [google/umt5-small](https://huggingface.co/google/umt5-small) @@ -50,7 +43,12 @@ Google has released the following variants: This model was contributed by [agemagician](https://huggingface.co/agemagician) and [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/google-research/t5x). -One can refer to [T5's documentation page](t5) for more tips, code examples and notebooks. +## Usage tips + +- UMT5 was only pre-trained on [mC4](https://huggingface.co/datasets/mc4) excluding any supervised training. +Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model. +- Since umT5 was pre-trained in an unsupervise manner, there's no real advantage to using a task prefix during single-task +fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix. ## Differences with mT5? `UmT5` is based on mT5, with a non-shared relative positional bias that is computed for each layer. This means that the model set `has_relative_bias` for each layer. @@ -73,6 +71,11 @@ The conversion script is also different because the model was saved in t5x's lat ['nyone who drink a alcohol A A. 
This I'] ``` + + +Refer to [T5's documentation page](t5) for more tips, code examples and notebooks. + + ## UMT5Config [[autodoc]] UMT5Config diff --git a/docs/source/en/model_doc/unispeech-sat.md b/docs/source/en/model_doc/unispeech-sat.md index 25489d9eeff..e2a21148115 100644 --- a/docs/source/en/model_doc/unispeech-sat.md +++ b/docs/source/en/model_doc/unispeech-sat.md @@ -37,7 +37,10 @@ state-of-the-art performance in universal representation learning, especially fo tasks. An ablation study is performed verifying the efficacy of each proposed method. Finally, we scale up training dataset to 94 thousand hours public audio data and achieve further performance improvement in all SUPERB tasks.* -Tips: +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be +found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT). + +## Usage tips - UniSpeechSat is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please use [`Wav2Vec2Processor`] for the feature extraction. @@ -45,10 +48,7 @@ Tips: decoded using [`Wav2Vec2CTCTokenizer`]. - UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks. -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be -found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT). - -## Documentation resources +## Resources - [Audio classification task guide](../tasks/audio_classification) - [Automatic speech recognition task guide](../tasks/asr) diff --git a/docs/source/en/model_doc/unispeech.md b/docs/source/en/model_doc/unispeech.md index 8338aa1bda2..2b2b13bed52 100644 --- a/docs/source/en/model_doc/unispeech.md +++ b/docs/source/en/model_doc/unispeech.md @@ -33,17 +33,17 @@ recognition by a maximum of 13.4% and 17.8% relative phone error rate reductions testing languages). The transferability of UniSpeech is also demonstrated on a domain-shift speech recognition task, i.e., a relative word error rate reduction of 6% against the previous approach.* -Tips: +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be +found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech). + +## Usage tips - UniSpeech is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please use [`Wav2Vec2Processor`] for the feature extraction. - UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using [`Wav2Vec2CTCTokenizer`]. -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be -found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech). - -## Documentation resources +## Resources - [Audio classification task guide](../tasks/audio_classification) - [Automatic speech recognition task guide](../tasks/asr) diff --git a/docs/source/en/model_doc/upernet.md b/docs/source/en/model_doc/upernet.md index db651acaa40..418c3ef1786 100644 --- a/docs/source/en/model_doc/upernet.md +++ b/docs/source/en/model_doc/upernet.md @@ -33,17 +33,7 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). 
The original code is based on OpenMMLab's mmsegmentation [here](https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/uper_head.py). -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with UPerNet. - -- Demo notebooks for UPerNet can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/UPerNet). -- [`UperNetForSemanticSegmentation`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/semantic-segmentation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb). -- See also: [Semantic segmentation task guide](../tasks/semantic_segmentation) - -If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - -## Usage +## Usage examples UPerNet is a general framework for semantic segmentation. It can be used with any vision backbone, like so: @@ -69,6 +59,16 @@ model = UperNetForSemanticSegmentation(config) Note that this will randomly initialize all the weights of the model. +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with UPerNet. + +- Demo notebooks for UPerNet can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/UPerNet). +- [`UperNetForSemanticSegmentation`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/semantic-segmentation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb). +- See also: [Semantic segmentation task guide](../tasks/semantic_segmentation) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + ## UperNetConfig [[autodoc]] UperNetConfig diff --git a/docs/source/en/model_doc/van.md b/docs/source/en/model_doc/van.md index b9539602d3b..83e4959b301 100644 --- a/docs/source/en/model_doc/van.md +++ b/docs/source/en/model_doc/van.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. -This model is in maintenance mode only, so we won't accept any new PRs changing its code. +This model is in maintenance mode only, we don't accept any new PRs changing its code. If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. You can do so by running the following command: `pip install -U transformers==4.30.0`. @@ -60,13 +60,11 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] VanConfig - ## VanModel [[autodoc]] VanModel - forward - ## VanForImageClassification [[autodoc]] VanForImageClassification diff --git a/docs/source/en/model_doc/videomae.md b/docs/source/en/model_doc/videomae.md index 5a3620040ad..75eb9617380 100644 --- a/docs/source/en/model_doc/videomae.md +++ b/docs/source/en/model_doc/videomae.md @@ -25,11 +25,6 @@ The abstract from the paper is the following: *Pre-training video transformers on extra large-scale datasets is generally required to achieve premier performance on relatively small datasets. 
In this paper, we show that video masked autoencoders (VideoMAE) are data-efficient learners for self-supervised video pre-training (SSVP). We are inspired by the recent ImageMAE and propose customized video tube masking and reconstruction. These simple designs turn out to be effective for overcoming information leakage caused by the temporal correlation during video reconstruction. We obtain three important findings on SSVP: (1) An extremely high proportion of masking ratio (i.e., 90% to 95%) still yields favorable performance of VideoMAE. The temporally redundant video content enables higher masking ratio than that of images. (2) VideoMAE achieves impressive results on very small datasets (i.e., around 3k-4k videos) without using any extra data. This is partially ascribed to the challenging task of video reconstruction to enforce high-level structure learning. (3) VideoMAE shows that data quality is more important than data quantity for SSVP. Domain shift between pre-training and target datasets are important issues in SSVP. Notably, our VideoMAE with the vanilla ViT backbone can achieve 83.9% on Kinects-400, 75.3% on Something-Something V2, 90.8% on UCF101, and 61.1% on HMDB51 without using any extra data.* -Tips: - -- One can use [`VideoMAEImageProcessor`] to prepare videos for the model. It will resize + normalize all frames of a video for you. -- [`VideoMAEForPreTraining`] includes the decoder on top for self-supervised pre-training. - drawing @@ -50,7 +45,6 @@ to fine-tune a VideoMAE model on a custom dataset. - [Video classification task guide](../tasks/video_classification) - [A πŸ€— Space](https://huggingface.co/spaces/sayakpaul/video-classification-ucf101-subset) showing how to perform inference with a video classification model. - ## VideoMAEConfig [[autodoc]] VideoMAEConfig @@ -72,6 +66,8 @@ to fine-tune a VideoMAE model on a custom dataset. ## VideoMAEForPreTraining +`VideoMAEForPreTraining` includes the decoder on top for self-supervised pre-training. + [[autodoc]] transformers.VideoMAEForPreTraining - forward diff --git a/docs/source/en/model_doc/vilt.md b/docs/source/en/model_doc/vilt.md index 2e2f4a140d2..2b0ac022da4 100644 --- a/docs/source/en/model_doc/vilt.md +++ b/docs/source/en/model_doc/vilt.md @@ -34,7 +34,14 @@ Vision-and-Language Transformer (ViLT), monolithic in the sense that the process simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of times faster than previous VLP models, yet with competitive or better downstream task performance.* -Tips: + + + ViLT architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/dandelin/ViLT). + +## Usage tips - The quickest way to get started with ViLT is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ViLT) (which showcase both inference and fine-tuning on custom data). @@ -45,17 +52,6 @@ Tips: which pixel values are real and which are padding. [`ViltProcessor`] automatically creates this for you. - The design of ViLT is very similar to that of a standard Vision Transformer (ViT). The only difference is that the model includes additional embedding layers for the language modality. - - - - ViLT architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/dandelin/ViLT). 
- - -Tips: - - The PyTorch version of this model is only available in torch 1.10 and higher. ## ViltConfig diff --git a/docs/source/en/model_doc/vision-encoder-decoder.md b/docs/source/en/model_doc/vision-encoder-decoder.md index 0beeaeae108..89d89896a2e 100644 --- a/docs/source/en/model_doc/vision-encoder-decoder.md +++ b/docs/source/en/model_doc/vision-encoder-decoder.md @@ -151,20 +151,32 @@ were contributed by [ydshieh](https://github.com/ydshieh). [[autodoc]] VisionEncoderDecoderConfig + + + ## VisionEncoderDecoderModel [[autodoc]] VisionEncoderDecoderModel - forward - from_encoder_decoder_pretrained + + + ## TFVisionEncoderDecoderModel [[autodoc]] TFVisionEncoderDecoderModel - call - from_encoder_decoder_pretrained + + + ## FlaxVisionEncoderDecoderModel [[autodoc]] FlaxVisionEncoderDecoderModel - __call__ - from_encoder_decoder_pretrained + + + diff --git a/docs/source/en/model_doc/vision-text-dual-encoder.md b/docs/source/en/model_doc/vision-text-dual-encoder.md index 6fa9728cac4..7cb68a26187 100644 --- a/docs/source/en/model_doc/vision-text-dual-encoder.md +++ b/docs/source/en/model_doc/vision-text-dual-encoder.md @@ -36,17 +36,29 @@ new zero-shot vision tasks such as image classification or retrieval. [[autodoc]] VisionTextDualEncoderProcessor + + + ## VisionTextDualEncoderModel [[autodoc]] VisionTextDualEncoderModel - forward + + + ## FlaxVisionTextDualEncoderModel [[autodoc]] FlaxVisionTextDualEncoderModel - __call__ + + + ## TFVisionTextDualEncoderModel [[autodoc]] TFVisionTextDualEncoderModel - call + + + diff --git a/docs/source/en/model_doc/visual_bert.md b/docs/source/en/model_doc/visual_bert.md index 7d84c0d9fae..1db218f1a53 100644 --- a/docs/source/en/model_doc/visual_bert.md +++ b/docs/source/en/model_doc/visual_bert.md @@ -32,7 +32,9 @@ simpler. Further analysis demonstrates that VisualBERT can ground elements of la explicit supervision and is even sensitive to syntactic relationships, tracking, for example, associations between verbs and image regions corresponding to their arguments.* -Tips: +This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The original code can be found [here](https://github.com/uclanlp/visualbert). + +## Usage tips 1. Most of the checkpoints provided work with the [`VisualBertForPreTraining`] configuration. Other checkpoints provided are the fine-tuned checkpoints for down-stream tasks - VQA ('visualbert-vqa'), VCR @@ -43,8 +45,6 @@ Tips: We do not provide the detector and its weights as a part of the package, but it will be available in the research projects, and the states can be loaded directly into the detector provided. -## Usage - VisualBERT is a multi-modal vision and language model. It can be used for visual question answering, multiple choice, visual reasoning and region-to-phrase correspondence tasks. VisualBERT uses a BERT-like transformer to prepare embeddings for image-text pairs. Both the text and visual features are then projected to a latent space with identical @@ -92,8 +92,6 @@ The following example shows how to get the last hidden state using [`VisualBertM >>> last_hidden_state = outputs.last_hidden_state ``` -This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The original code can be found [here](https://github.com/uclanlp/visualbert). 
- ## VisualBertConfig [[autodoc]] VisualBertConfig diff --git a/docs/source/en/model_doc/vit.md b/docs/source/en/model_doc/vit.md index 409580d0948..25c3a6c8f53 100644 --- a/docs/source/en/model_doc/vit.md +++ b/docs/source/en/model_doc/vit.md @@ -24,7 +24,6 @@ Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minder Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining very good results compared to familiar convolutional architectures. - The abstract from the paper is the following: *While the Transformer architecture has become the de-facto standard for natural language processing tasks, its @@ -36,30 +35,6 @@ data and transferred to multiple mid-sized or small image recognition benchmarks Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.* -Tips: - -- Demo notebooks regarding inference as well as fine-tuning ViT on custom data can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer). -- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches, - which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be - used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of - vectors to a standard Transformer encoder. -- As the Vision Transformer expects each image to be of the same size (resolution), one can use - [`ViTImageProcessor`] to resize (or rescale) and normalize images for the model. -- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of - each checkpoint. For example, `google/vit-base-patch16-224` refers to a base-sized architecture with patch - resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the [hub](https://huggingface.co/models?search=vit). -- The available checkpoints are either (1) pre-trained on [ImageNet-21k](http://www.image-net.org/) (a collection of - 14 million images and 21k classes) only, or (2) also fine-tuned on [ImageNet](http://www.image-net.org/challenges/LSVRC/2012/) (also referred to as ILSVRC 2012, a collection of 1.3 million - images and 1,000 classes). -- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to - use a higher resolution than pre-training [(Touvron et al., 2019)](https://arxiv.org/abs/1906.06423), [(Kolesnikov - et al., 2020)](https://arxiv.org/abs/1912.11370). In order to fine-tune at higher resolution, the authors perform - 2D interpolation of the pre-trained position embeddings, according to their location in the original image. -- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed - an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked - language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant - improvement of 2% to training from scratch, but still 4% behind supervised pre-training. - drawing @@ -87,28 +62,35 @@ Following the original Vision Transformer, some follow-up works have been made: This model was contributed by [nielsr](https://huggingface.co/nielsr). 
The original code (written in JAX) can be found [here](https://github.com/google-research/vision_transformer). -Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models), who already converted the weights from JAX to PyTorch. Credits -go to him! - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT. - - - -- [`ViTForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). -- A blog on fine-tuning [`ViTForImageClassification`] on a custom dataset can be found [here](https://huggingface.co/blog/fine-tune-vit). -- More demo notebooks to fine-tune [`ViTForImageClassification`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer). -- [Image classification task guide](../tasks/image_classification) - -Besides that: - -- [`ViTForMaskedImageModeling`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). - -If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. +Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models), +who already converted the weights from JAX to PyTorch. Credits go to him! + +## Usage tips + +- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches, + which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be + used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of + vectors to a standard Transformer encoder. +- As the Vision Transformer expects each image to be of the same size (resolution), one can use + [`ViTImageProcessor`] to resize (or rescale) and normalize images for the model. +- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of + each checkpoint. For example, `google/vit-base-patch16-224` refers to a base-sized architecture with patch + resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the [hub](https://huggingface.co/models?search=vit). +- The available checkpoints are either (1) pre-trained on [ImageNet-21k](http://www.image-net.org/) (a collection of + 14 million images and 21k classes) only, or (2) also fine-tuned on [ImageNet](http://www.image-net.org/challenges/LSVRC/2012/) (also referred to as ILSVRC 2012, a collection of 1.3 million + images and 1,000 classes). +- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to + use a higher resolution than pre-training [(Touvron et al., 2019)](https://arxiv.org/abs/1906.06423), [(Kolesnikov + et al., 2020)](https://arxiv.org/abs/1912.11370). In order to fine-tune at higher resolution, the authors perform + 2D interpolation of the pre-trained position embeddings, according to their location in the original image. 
+- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed + an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked + language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant + improvement of 2% to training from scratch, but still 4% behind supervised pre-training. ## Resources +Demo notebooks regarding inference as well as fine-tuning ViT on custom data can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer). A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. `ViTForImageClassification` is supported by: @@ -134,7 +116,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - A blog post on [Deploying Hugging Face ViT on Vertex AI](https://huggingface.co/blog/deploy-vertex-ai) - A blog post on [Deploying Hugging Face ViT on Kubernetes with TF Serving](https://huggingface.co/blog/deploy-tfserving-kubernetes) - ## ViTConfig [[autodoc]] ViTConfig @@ -144,12 +125,14 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] ViTFeatureExtractor - __call__ - ## ViTImageProcessor [[autodoc]] ViTImageProcessor - preprocess + + + ## ViTModel [[autodoc]] ViTModel @@ -165,6 +148,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] ViTForImageClassification - forward + + + ## TFViTModel [[autodoc]] TFViTModel @@ -175,6 +161,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TFViTForImageClassification - call + + + ## FlaxVitModel [[autodoc]] FlaxViTModel @@ -184,3 +173,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] FlaxViTForImageClassification - __call__ + + + diff --git a/docs/source/en/model_doc/vit_hybrid.md b/docs/source/en/model_doc/vit_hybrid.md index 84969cd0f62..52c0d35bc13 100644 --- a/docs/source/en/model_doc/vit_hybrid.md +++ b/docs/source/en/model_doc/vit_hybrid.md @@ -25,7 +25,6 @@ Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transfo very good results compared to familiar convolutional architectures. ViT hybrid is a slight variant of the [plain Vision Transformer](vit), by leveraging a convolutional backbone (specifically, [BiT](bit)) whose features are used as initial "tokens" for the Transformer. - The abstract from the paper is the following: *While the Transformer architecture has become the de-facto standard for natural language processing tasks, its @@ -40,7 +39,6 @@ substantially fewer computational resources to train.* This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be found [here](https://github.com/google-research/vision_transformer). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid. @@ -52,7 +50,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! 
The resource should ideally demonstrate something new instead of duplicating an existing resource. - ## ViTHybridConfig [[autodoc]] ViTHybridConfig diff --git a/docs/source/en/model_doc/vit_mae.md b/docs/source/en/model_doc/vit_mae.md index c14cc7e57c9..27d6d26816a 100644 --- a/docs/source/en/model_doc/vit_mae.md +++ b/docs/source/en/model_doc/vit_mae.md @@ -32,7 +32,15 @@ enables us to train large models efficiently and effectively: we accelerate trai models that generalize well: e.g., a vanilla ViT-Huge model achieves the best accuracy (87.8%) among methods that use only ImageNet-1K data. Transfer performance in downstream tasks outperforms supervised pre-training and shows promising scaling behavior.* -Tips: + + + MAE architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [sayakpaul](https://github.com/sayakpaul) and +[ariG23498](https://github.com/ariG23498) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/mae). + +## Usage tips - MAE (masked auto encoding) is a method for self-supervised pre-training of Vision Transformers (ViTs). The pre-training objective is relatively simple: by masking a large portion (75%) of the image patches, the model must reconstruct raw pixel values. One can use [`ViTMAEForPreTraining`] for this purpose. @@ -44,14 +52,6 @@ consists of Transformer blocks) takes as input. Each mask token is a shared, lea sin/cos position embeddings are added both to the input of the encoder and the decoder. - For a visual understanding of how MAEs work you can check out this [post](https://keras.io/examples/vision/masked_image_modeling/). - - - MAE architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [sayakpaul](https://github.com/sayakpaul) and -[ariG23498](https://github.com/ariG23498) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/mae). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTMAE. @@ -65,26 +65,31 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] ViTMAEConfig + + ## ViTMAEModel [[autodoc]] ViTMAEModel - forward - ## ViTMAEForPreTraining [[autodoc]] transformers.ViTMAEForPreTraining - forward + + ## TFViTMAEModel [[autodoc]] TFViTMAEModel - call - ## TFViTMAEForPreTraining [[autodoc]] transformers.TFViTMAEForPreTraining - call + + + diff --git a/docs/source/en/model_doc/vit_msn.md b/docs/source/en/model_doc/vit_msn.md index ded0245194f..666b7dd0dfd 100644 --- a/docs/source/en/model_doc/vit_msn.md +++ b/docs/source/en/model_doc/vit_msn.md @@ -33,7 +33,13 @@ while producing representations of a high semantic level that perform competitiv on ImageNet-1K, with only 5,000 annotated images, our base MSN model achieves 72.4% top-1 accuracy, and with 1% of ImageNet-1K labels, we achieve 75.7% top-1 accuracy, setting a new state-of-the-art for self-supervised learning on this benchmark.* -Tips: +drawing + + MSN architecture. Taken from the original paper. + +This model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/msn). + +## Usage tips - MSN (masked siamese networks) is a method for self-supervised pre-training of Vision Transformers (ViTs). 
The pre-training objective is to match the prototypes assigned to the unmasked views of the images to that of the masked views of the same images. @@ -43,13 +49,6 @@ use the [`ViTMSNForImageClassification`] class which is initialized from [`ViTMS - MSN is particularly useful in the low-shot and extreme low-shot regimes. Notably, it achieves 75.7% top-1 accuracy with only 1% of ImageNet-1K labels when fine-tuned. - -drawing - - MSN architecture. Taken from the original paper. - -This model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/msn). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT MSN. @@ -65,13 +64,11 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] ViTMSNConfig - ## ViTMSNModel [[autodoc]] ViTMSNModel - forward - ## ViTMSNForImageClassification [[autodoc]] ViTMSNForImageClassification diff --git a/docs/source/en/model_doc/vitdet.md b/docs/source/en/model_doc/vitdet.md index 657e467ee31..81bf787d6cd 100644 --- a/docs/source/en/model_doc/vitdet.md +++ b/docs/source/en/model_doc/vitdet.md @@ -21,13 +21,12 @@ The abstract from the paper is the following: *We explore the plain, non-hierarchical Vision Transformer (ViT) as a backbone network for object detection. This design enables the original ViT architecture to be fine-tuned for object detection without needing to redesign a hierarchical backbone for pre-training. With minimal adaptations for fine-tuning, our plain-backbone detector can achieve competitive results. Surprisingly, we observe: (i) it is sufficient to build a simple feature pyramid from a single-scale feature map (without the common FPN design) and (ii) it is sufficient to use window attention (without shifting) aided with very few cross-window propagation blocks. With plain ViT backbones pre-trained as Masked Autoencoders (MAE), our detector, named ViTDet, can compete with the previous leading methods that were all based on hierarchical backbones, reaching up to 61.3 AP_box on the COCO dataset using only ImageNet-1K pre-training. We hope our study will draw attention to research on plain-backbone detectors.* -Tips: - -- For the moment, only the backbone is available. - This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet). +Tips: + +- At the moment, only the backbone is available. ## VitDetConfig diff --git a/docs/source/en/model_doc/vitmatte.md b/docs/source/en/model_doc/vitmatte.md index 479b398f806..5a6d501030f 100644 --- a/docs/source/en/model_doc/vitmatte.md +++ b/docs/source/en/model_doc/vitmatte.md @@ -21,10 +21,6 @@ The abstract from the paper is the following: *Recently, plain vision Transformers (ViTs) have shown impressive performance on various computer vision tasks, thanks to their strong modeling capacity and large-scale pretraining. However, they have not yet conquered the problem of image matting. We hypothesize that image matting could also be boosted by ViTs and present a new efficient and robust ViT-based matting system, named ViTMatte. Our method utilizes (i) a hybrid attention mechanism combined with a convolution neck to help ViTs achieve an excellent performance-computation trade-off in matting tasks. 
(ii) Additionally, we introduce the detail capture module, which just consists of simple lightweight convolutions to complement the detailed information required by matting. To the best of our knowledge, ViTMatte is the first work to unleash the potential of ViT on image matting with concise adaptation. It inherits many superior properties from ViT to matting, including various pretraining strategies, concise architecture design, and flexible inference strategies. We evaluate ViTMatte on Composition-1k and Distinctions-646, the most commonly used benchmark for image matting, our method achieves state-of-the-art performance and outperforms prior matting works by a large margin.* -Tips: - -- The model expects both the image and trimap (concatenated) as input. One can use [`ViTMatteImageProcessor`] for this purpose. - This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/hustvl/ViTMatte). @@ -39,6 +35,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - A demo notebook regarding inference with [`VitMatteForImageMatting`], including background replacement, can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ViTMatte). + + +The model expects both the image and trimap (concatenated) as input. Use [`ViTMatteImageProcessor`] for this purpose. + ## VitMatteConfig diff --git a/docs/source/en/model_doc/vits.md b/docs/source/en/model_doc/vits.md index 1b57df4027d..73001d82ed5 100644 --- a/docs/source/en/model_doc/vits.md +++ b/docs/source/en/model_doc/vits.md @@ -16,7 +16,6 @@ specific language governing permissions and limitations under the License. The VITS model was proposed in [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. - VITS (**V**ariational **I**nference with adversarial learning for end-to-end **T**ext-to-**S**peech) is an end-to-end speech synthesis model that predicts a speech waveform conditional on an input text sequence. It is a conditional variational autoencoder (VAE) comprised of a posterior encoder, decoder, and conditional prior. @@ -42,7 +41,7 @@ as these checkpoints use the same architecture and a slightly modified tokenizer This model was contributed by [Matthijs](https://huggingface.co/Matthijs) and [sanchit-gandhi](https://huggingface.co/sanchit-gandhi). The original code can be found [here](https://github.com/jaywalnut310/vits). -## Model Usage +## Usage examples Both the VITS and MMS-TTS checkpoints can be used with the same API. Since the flow-based model is non-deterministic, it is good practice to set a seed to ensure reproducibility of the outputs. For languages with a Roman alphabet, diff --git a/docs/source/en/model_doc/vivit.md b/docs/source/en/model_doc/vivit.md index 755629a7675..4426493a0ff 100644 --- a/docs/source/en/model_doc/vivit.md +++ b/docs/source/en/model_doc/vivit.md @@ -21,7 +21,6 @@ The abstract from the paper is the following: *We present pure-transformer based models for video classification, drawing upon the recent success of such models in image classification. Our model extracts spatio-temporal tokens from the input video, which are then encoded by a series of transformer layers. In order to handle the long sequences of tokens encountered in video, we propose several, efficient variants of our model which factorise the spatial- and temporal-dimensions of the input. 
Although transformer-based models are known to only be effective when large training datasets are available, we show how we can effectively regularise the model during training and leverage pretrained image models to be able to train on comparatively small datasets. We conduct thorough ablation studies, and achieve state-of-the-art results on multiple video classification benchmarks including Kinetics 400 and 600, Epic Kitchens, Something-Something v2 and Moments in Time, outperforming prior methods based on deep 3D convolutional networks.* - This model was contributed by [jegormeister](https://huggingface.co/jegormeister). The original code (written in JAX) can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/vivit). ## VivitConfig diff --git a/docs/source/en/model_doc/wav2vec2-conformer.md b/docs/source/en/model_doc/wav2vec2-conformer.md index 87e255cd0c6..c32c03bb0cb 100644 --- a/docs/source/en/model_doc/wav2vec2-conformer.md +++ b/docs/source/en/model_doc/wav2vec2-conformer.md @@ -24,7 +24,10 @@ The official results of the model can be found in Table 3 and Table 4 of the pap The Wav2Vec2-Conformer weights were released by the Meta AI team within the [Fairseq library](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/README.md#pre-trained-models). -Tips: +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). +The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec). + +## Usage tips - Wav2Vec2-Conformer follows the same architecture as Wav2Vec2, but replaces the *Attention*-block with a *Conformer*-block as introduced in [Conformer: Convolution-augmented Transformer for Speech Recognition](https://arxiv.org/abs/2005.08100). @@ -34,10 +37,7 @@ an improved word error rate. - Wav2Vec2-Conformer can use either no relative position embeddings, Transformer-XL-like position embeddings, or rotary position embeddings by setting the correct `config.position_embeddings_type`. -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). -The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec). - -## Documentation resources +## Resources - [Audio classification task guide](../tasks/audio_classification) - [Automatic speech recognition task guide](../tasks/asr) diff --git a/docs/source/en/model_doc/wav2vec2.md b/docs/source/en/model_doc/wav2vec2.md index 3a67f66d9d1..81d8f332ace 100644 --- a/docs/source/en/model_doc/wav2vec2.md +++ b/docs/source/en/model_doc/wav2vec2.md @@ -31,14 +31,14 @@ of the art on the 100 hour subset while using 100 times less labeled data. Using pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech recognition with limited amounts of labeled data.* -Tips: +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). + +## Usage tips - Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. - Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded using [`Wav2Vec2CTCTokenizer`]. -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Wav2Vec2. 
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. @@ -167,6 +167,9 @@ Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower [[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput + + + ## Wav2Vec2Model [[autodoc]] Wav2Vec2Model @@ -198,6 +201,9 @@ Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower [[autodoc]] Wav2Vec2ForPreTraining - forward + + + ## TFWav2Vec2Model [[autodoc]] TFWav2Vec2Model @@ -213,6 +219,9 @@ Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower [[autodoc]] TFWav2Vec2ForCTC - call + + + ## FlaxWav2Vec2Model [[autodoc]] FlaxWav2Vec2Model @@ -227,3 +236,6 @@ Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower [[autodoc]] FlaxWav2Vec2ForPreTraining - __call__ + + + diff --git a/docs/source/en/model_doc/wav2vec2_phoneme.md b/docs/source/en/model_doc/wav2vec2_phoneme.md index a852bef637b..93e0656f493 100644 --- a/docs/source/en/model_doc/wav2vec2_phoneme.md +++ b/docs/source/en/model_doc/wav2vec2_phoneme.md @@ -31,7 +31,13 @@ mapping phonemes of the training languages to the target language using articula this simple method significantly outperforms prior work which introduced task-specific architectures and used only part of a monolingually pretrained model.* -Tips: +Relevant checkpoints can be found under https://huggingface.co/models?other=phoneme-recognition. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten) + +The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). + +## Usage tips - Wav2Vec2Phoneme uses the exact same architecture as Wav2Vec2 - Wav2Vec2Phoneme is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. @@ -39,17 +45,16 @@ Tips: decoded using [`Wav2Vec2PhonemeCTCTokenizer`]. - Wav2Vec2Phoneme can be fine-tuned on multiple language at once and decode unseen languages in a single forward pass to a sequence of phonemes -- By default the model outputs a sequence of phonemes. In order to transform the phonemes to a sequence of words one +- By default, the model outputs a sequence of phonemes. In order to transform the phonemes to a sequence of words one should make use of a dictionary and language model. -Relevant checkpoints can be found under https://huggingface.co/models?other=phoneme-recognition. -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten) + -The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). - -Wav2Vec2Phoneme's architecture is based on the Wav2Vec2 model, so one can refer to [`Wav2Vec2`]'s documentation page except for the tokenizer. +Wav2Vec2Phoneme's architecture is based on the Wav2Vec2 model, for API reference, check out [`Wav2Vec2`](wav2vec2)'s documentation page +except for the tokenizer. + ## Wav2Vec2PhonemeCTCTokenizer diff --git a/docs/source/en/model_doc/wavlm.md b/docs/source/en/model_doc/wavlm.md index 2754304d826..13f62980756 100644 --- a/docs/source/en/model_doc/wavlm.md +++ b/docs/source/en/model_doc/wavlm.md @@ -35,7 +35,12 @@ additional overlapped utterances are created unsupervisely and incorporated duri the training dataset from 60k hours to 94k hours. 
WavLM Large achieves state-of-the-art performance on the SUPERB benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.* -Tips: +Relevant checkpoints can be found under https://huggingface.co/models?other=wavlm. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be +found [here](https://github.com/microsoft/unilm/tree/master/wavlm). + +## Usage tips - WavLM is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please use [`Wav2Vec2Processor`] for the feature extraction. @@ -43,12 +48,7 @@ Tips: using [`Wav2Vec2CTCTokenizer`]. - WavLM performs especially well on speaker verification, speaker identification, and speaker diarization tasks. -Relevant checkpoints can be found under https://huggingface.co/models?other=wavlm. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be -found [here](https://github.com/microsoft/unilm/tree/master/wavlm). - -## Documentation resources +## Resources - [Audio classification task guide](../tasks/audio_classification) - [Automatic speech recognition task guide](../tasks/asr) diff --git a/docs/source/en/model_doc/whisper.md b/docs/source/en/model_doc/whisper.md index 2f1cfc5e22b..4ea7e943813 100644 --- a/docs/source/en/model_doc/whisper.md +++ b/docs/source/en/model_doc/whisper.md @@ -24,18 +24,16 @@ The abstract from the paper is the following: *We study the capabilities of speech processing systems trained simply to predict large amounts of transcripts of audio on the internet. When scaled to 680,000 hours of multilingual and multitask supervision, the resulting models generalize well to standard benchmarks and are often competitive with prior fully supervised results but in a zeroshot transfer setting without the need for any finetuning. When compared to humans, the models approach their accuracy and robustness. We are releasing models and inference code to serve as a foundation for further work on robust speech processing.* +This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). +The original code can be found [here](https://github.com/openai/whisper). -Tips: +## Usage tips - The model usually performs well without requiring any finetuning. - The architecture follows a classic encoder-decoder architecture, which means that it relies on the [`~generation.GenerationMixin.generate`] function for inference. - Inference is currently only implemented for short-form i.e. audio is pre-segmented into <=30s segments. Long-form (including timestamps) will be implemented in a future release. - One can use [`WhisperProcessor`] to prepare audio for the model, and decode the predicted ID's back into text. -This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). -The original code can be found [here](https://github.com/openai/whisper). - - ## WhisperConfig [[autodoc]] WhisperConfig @@ -76,6 +74,9 @@ The original code can be found [here](https://github.com/openai/whisper). - batch_decode - decode + + + ## WhisperModel [[autodoc]] WhisperModel @@ -98,6 +99,8 @@ The original code can be found [here](https://github.com/openai/whisper). 
[[autodoc]] WhisperForAudioClassification - forward + + ## TFWhisperModel @@ -109,6 +112,8 @@ The original code can be found [here](https://github.com/openai/whisper). [[autodoc]] TFWhisperForConditionalGeneration - call + + ## FlaxWhisperModel @@ -125,3 +130,6 @@ The original code can be found [here](https://github.com/openai/whisper). [[autodoc]] FlaxWhisperForAudioClassification - __call__ + + + diff --git a/docs/source/en/model_doc/xglm.md b/docs/source/en/model_doc/xglm.md index 1b184c17e80..470e42c747b 100644 --- a/docs/source/en/model_doc/xglm.md +++ b/docs/source/en/model_doc/xglm.md @@ -42,7 +42,7 @@ in social value tasks such as hate speech detection in five languages and find i This model was contributed by [Suraj](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/xglm). -## Documentation resources +## Resources - [Causal language modeling task guide](../tasks/language_modeling) @@ -62,6 +62,9 @@ This model was contributed by [Suraj](https://huggingface.co/valhalla). The orig [[autodoc]] XGLMTokenizerFast + + + ## XGLMModel [[autodoc]] XGLMModel @@ -72,6 +75,9 @@ This model was contributed by [Suraj](https://huggingface.co/valhalla). The orig [[autodoc]] XGLMForCausalLM - forward + + + ## TFXGLMModel [[autodoc]] TFXGLMModel @@ -82,6 +88,9 @@ This model was contributed by [Suraj](https://huggingface.co/valhalla). The orig [[autodoc]] TFXGLMForCausalLM - call + + + ## FlaxXGLMModel [[autodoc]] FlaxXGLMModel @@ -90,4 +99,7 @@ This model was contributed by [Suraj](https://huggingface.co/valhalla). The orig ## FlaxXGLMForCausalLM [[autodoc]] FlaxXGLMForCausalLM - - __call__ \ No newline at end of file + - __call__ + + + \ No newline at end of file diff --git a/docs/source/en/model_doc/xlm-prophetnet.md b/docs/source/en/model_doc/xlm-prophetnet.md index 5e7ba5b7e3f..7a61aeb3e34 100644 --- a/docs/source/en/model_doc/xlm-prophetnet.md +++ b/docs/source/en/model_doc/xlm-prophetnet.md @@ -36,7 +36,7 @@ Zhang, Ming Zhou on 13 Jan, 2020. XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of just the next token. Its architecture is identical to ProhpetNet, but the model was trained on the multi-lingual -"wiki100" Wikipedia dump. +"wiki100" Wikipedia dump. XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained on the cross-lingual dataset XGLUE. The abstract from the paper is the following: @@ -52,11 +52,7 @@ state-of-the-art results on all these datasets compared to the models using the The Authors' code can be found [here](https://github.com/microsoft/ProphetNet). -Tips: - -- XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained on the cross-lingual dataset XGLUE. - -## Documentation resources +## Resources - [Causal language modeling task guide](../tasks/language_modeling) - [Translation task guide](../tasks/translation) diff --git a/docs/source/en/model_doc/xlm-roberta-xl.md b/docs/source/en/model_doc/xlm-roberta-xl.md index b6592946070..f9cb78c0bf4 100644 --- a/docs/source/en/model_doc/xlm-roberta-xl.md +++ b/docs/source/en/model_doc/xlm-roberta-xl.md @@ -24,15 +24,15 @@ The abstract from the paper is the following: *Recent work has demonstrated the effectiveness of cross-lingual language model pretraining for cross-lingual understanding. 
In this study, we present the results of two larger multilingual masked language models, with 3.5B and 10.7B parameters. Our two new models dubbed XLM-R XL and XLM-R XXL outperform XLM-R by 1.8% and 2.4% average accuracy on XNLI. Our model also outperforms the RoBERTa-Large model on several English tasks of the GLUE benchmark by 0.3% on average while handling 99 more languages. This suggests pretrained models with larger capacity may obtain both strong performance on high-resource languages while greatly improving low-resource languages. We make our code and models publicly available.* -Tips: - -- XLM-RoBERTa-XL is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does - not require `lang` tensors to understand which language is used, and should be able to determine the correct - language from the input ids. - This model was contributed by [Soonhwan-Kwon](https://github.com/Soonhwan-Kwon) and [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). -## Documentation resources +## Usage tips + +XLM-RoBERTa-XL is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does +not require `lang` tensors to understand which language is used, and should be able to determine the correct +language from the input ids. + +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) diff --git a/docs/source/en/model_doc/xlm-roberta.md b/docs/source/en/model_doc/xlm-roberta.md index 935003156fd..58540015232 100644 --- a/docs/source/en/model_doc/xlm-roberta.md +++ b/docs/source/en/model_doc/xlm-roberta.md @@ -46,16 +46,14 @@ languages at scale. Finally, we show, for the first time, the possibility of mul per-language performance; XLM-Ris very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make XLM-R code, data, and models publicly available.* -Tips: +This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). + +## Usage tips - XLM-RoBERTa is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does not require `lang` tensors to understand which language is used, and should be able to determine the correct language from the input ids. - Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective. It only uses masked language modeling on sentences coming from one language. -- This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples - as well as the information relative to the inputs and outputs. - -This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). ## Resources @@ -110,6 +108,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - A blog post on how to [Deploy Serverless XLM RoBERTa on AWS Lambda](https://www.philschmid.de/multilingual-serverless-xlm-roberta-with-huggingface). + + +This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples as well as the information relative to the inputs and outputs. 
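For example, a minimal fill-mask sketch (the `xlm-roberta-base` checkpoint is an illustrative choice); note that no `lang` tensor is passed, since the model infers the language from the input ids:

```python
from transformers import pipeline

# a publicly available multilingual checkpoint, used here for illustration
unmasker = pipeline("fill-mask", model="xlm-roberta-base")

# no `lang` tensor is needed; the same model handles both languages
print(unmasker("Hello, I'm a <mask> model.")[0]["token_str"])
print(unmasker("Bonjour, je suis un modèle <mask>.")[0]["token_str"])
```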
+ + ## XLMRobertaConfig [[autodoc]] XLMRobertaConfig @@ -126,6 +129,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] XLMRobertaTokenizerFast + + + ## XLMRobertaModel [[autodoc]] XLMRobertaModel @@ -161,6 +167,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] XLMRobertaForQuestionAnswering - forward + + + ## TFXLMRobertaModel [[autodoc]] TFXLMRobertaModel @@ -196,6 +205,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] TFXLMRobertaForQuestionAnswering - call + + + ## FlaxXLMRobertaModel [[autodoc]] FlaxXLMRobertaModel @@ -230,3 +242,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] FlaxXLMRobertaForQuestionAnswering - __call__ + + + \ No newline at end of file diff --git a/docs/source/en/model_doc/xlm-v.md b/docs/source/en/model_doc/xlm-v.md index 38bed0dc46b..049a1f35ad9 100644 --- a/docs/source/en/model_doc/xlm-v.md +++ b/docs/source/en/model_doc/xlm-v.md @@ -35,7 +35,10 @@ a multilingual language model with a one million token vocabulary. XLM-V outperf tested on ranging from natural language inference (XNLI), question answering (MLQA, XQuAD, TyDiQA), and named entity recognition (WikiAnn) to low-resource tasks (Americas NLI, MasakhaNER).* -Tips: +This model was contributed by [stefan-it](https://huggingface.co/stefan-it), including detailed experiments with XLM-V on downstream tasks. +The experiments repository can be found [here](https://github.com/stefan-it/xlm-v-experiments). + +## Usage tips - XLM-V is compatible with the XLM-RoBERTa model architecture, only model weights from [`fairseq`](https://github.com/facebookresearch/fairseq) library had to be converted. @@ -43,5 +46,7 @@ Tips: A XLM-V (base size) model is available under the [`facebook/xlm-v-base`](https://huggingface.co/facebook/xlm-v-base) identifier. -This model was contributed by [stefan-it](https://huggingface.co/stefan-it), including detailed experiments with XLM-V on downstream tasks. -The experiments repository can be found [here](https://github.com/stefan-it/xlm-v-experiments). + + +XLM-V architecture is the same as XLM-RoBERTa, refer to [XLM-RoBERTa documentation](xlm-roberta) for API reference, and examples. + \ No newline at end of file diff --git a/docs/source/en/model_doc/xlm.md b/docs/source/en/model_doc/xlm.md index 8b5b31a2dbe..0ee11c6addc 100644 --- a/docs/source/en/model_doc/xlm.md +++ b/docs/source/en/model_doc/xlm.md @@ -46,7 +46,9 @@ obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the a machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming the previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.* -Tips: +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/facebookresearch/XLM/). + +## Usage tips - XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation). @@ -57,9 +59,7 @@ Tips: * Masked language modeling (MLM) which is like RoBERTa. One of the languages is selected for each training sample, and the model input is a sentence of 256 tokens, that may span over several documents in one of those languages, with dynamic masking of the tokens. 
* A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two different languages, with random masking. To predict one of the masked tokens, the model can use both, the surrounding context in language 1 and the context given by language 2. -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/facebookresearch/XLM/). - -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -84,6 +84,9 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o [[autodoc]] models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput + + + ## XLMModel [[autodoc]] XLMModel @@ -119,6 +122,9 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o [[autodoc]] XLMForQuestionAnswering - forward + + + ## TFXLMModel [[autodoc]] TFXLMModel @@ -148,3 +154,8 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o [[autodoc]] TFXLMForQuestionAnsweringSimple - call + + + + + diff --git a/docs/source/en/model_doc/xlnet.md b/docs/source/en/model_doc/xlnet.md index 3685728cd72..d2209c3d550 100644 --- a/docs/source/en/model_doc/xlnet.md +++ b/docs/source/en/model_doc/xlnet.md @@ -44,7 +44,9 @@ formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state- pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by a large margin, including question answering, natural language inference, sentiment analysis, and document ranking.* -Tips: +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/zihangdai/xlnet/). + +## Usage tips - The specific attention pattern can be controlled at training and test time using the `perm_mask` input. - Due to the difficulty of training a fully auto-regressive model over various factorization order, XLNet is pretrained @@ -56,9 +58,7 @@ Tips: - XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. It permutes the tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. Since this is all done with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,…,sequence length. - XLNet also uses the same recurrence mechanism as Transformer-XL to build long-term dependencies. -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/zihangdai/xlnet/). - -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -110,6 +110,9 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o [[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput + + + ## XLNetModel [[autodoc]] XLNetModel @@ -145,6 +148,9 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o [[autodoc]] XLNetForQuestionAnswering - forward + + + ## TFXLNetModel [[autodoc]] TFXLNetModel @@ -174,3 +180,6 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). 
The o [[autodoc]] TFXLNetForQuestionAnsweringSimple - call + + + \ No newline at end of file diff --git a/docs/source/en/model_doc/xls_r.md b/docs/source/en/model_doc/xls_r.md index 8e22004244c..2226c813e72 100644 --- a/docs/source/en/model_doc/xls_r.md +++ b/docs/source/en/model_doc/xls_r.md @@ -34,14 +34,18 @@ language identification. Moreover, we show that with sufficient model size, cros English-only pretraining when translating English speech into other languages, a setting which favors monolingual pretraining. We hope XLS-R can help to improve speech processing tasks for many more languages of the world.* -Tips: +Relevant checkpoints can be found under https://huggingface.co/models?other=xls_r. + +The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). + +## Usage tips - XLS-R is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. - XLS-R model was trained using connectionist temporal classification (CTC) so the model output has to be decoded using [`Wav2Vec2CTCTokenizer`]. -Relevant checkpoints can be found under https://huggingface.co/models?other=xls_r. + -XLS-R's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2). +XLS-R's architecture is based on the Wav2Vec2 model, refer to [Wav2Vec2's documentation page](wav2vec2) for API reference. -The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). + \ No newline at end of file diff --git a/docs/source/en/model_doc/xlsr_wav2vec2.md b/docs/source/en/model_doc/xlsr_wav2vec2.md index 643d37416d3..d1b5444c246 100644 --- a/docs/source/en/model_doc/xlsr_wav2vec2.md +++ b/docs/source/en/model_doc/xlsr_wav2vec2.md @@ -34,12 +34,16 @@ individual models. Analysis shows that the latent discrete speech representation increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing XLSR-53, a large model pretrained in 53 languages.* -Tips: +The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). + +## Usage tips - XLSR-Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. - XLSR-Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded using [`Wav2Vec2CTCTokenizer`]. + + XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2). -The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). + diff --git a/docs/source/en/model_doc/xmod.md b/docs/source/en/model_doc/xmod.md index 5a3409bbc4c..47797fa6490 100644 --- a/docs/source/en/model_doc/xmod.md +++ b/docs/source/en/model_doc/xmod.md @@ -25,13 +25,15 @@ The abstract from the paper is the following: *Multilingual pre-trained models are known to suffer from the curse of multilinguality, which causes per-language performance to drop as they cover more languages. We address this issue by introducing language-specific modules, which allows us to grow the total capacity of the model, while keeping the total number of trainable parameters per language constant. In contrast with prior work that learns language-specific components post-hoc, we pre-train the modules of our Cross-lingual Modular (X-MOD) models from the start. 
Our experiments on natural language inference, named entity recognition and question answering show that our approach not only mitigates the negative interference between languages, but also enables positive transfer, resulting in improved monolingual and cross-lingual performance. Furthermore, our approach enables adding languages post-hoc with no measurable drop in performance, no longer limiting the model usage to the set of pre-trained languages.* +This model was contributed by [jvamvas](https://huggingface.co/jvamvas). +The original code can be found [here](https://github.com/facebookresearch/fairseq/tree/58cc6cca18f15e6d56e3f60c959fe4f878960a60/fairseq/models/xmod) and the original documentation is found [here](https://github.com/facebookresearch/fairseq/tree/58cc6cca18f15e6d56e3f60c959fe4f878960a60/examples/xmod). + +## Usage tips + -Tips: - X-MOD is similar to [XLM-R](xlm-roberta), but a difference is that the input language needs to be specified so that the correct language adapter can be activated. - The main models – base and large – have adapters for 81 languages. -This model was contributed by [jvamvas](https://huggingface.co/jvamvas). -The original code can be found [here](https://github.com/facebookresearch/fairseq/tree/58cc6cca18f15e6d56e3f60c959fe4f878960a60/fairseq/models/xmod) and the original documentation is found [here](https://github.com/facebookresearch/fairseq/tree/58cc6cca18f15e6d56e3f60c959fe4f878960a60/examples/xmod). - ## Adapter Usage ### Input language diff --git a/docs/source/en/model_doc/yolos.md b/docs/source/en/model_doc/yolos.md index 6185c3a0675..5386c373ac8 100644 --- a/docs/source/en/model_doc/yolos.md +++ b/docs/source/en/model_doc/yolos.md @@ -25,10 +25,6 @@ The abstract from the paper is the following: *Can Transformer perform 2D object- and region-level recognition from a pure sequence-to-sequence perspective with minimal knowledge about the 2D spatial structure? To answer this question, we present You Only Look at One Sequence (YOLOS), a series of object detection models based on the vanilla Vision Transformer with the fewest possible modifications, region priors, as well as inductive biases of the target task. We find that YOLOS pre-trained on the mid-sized ImageNet-1k dataset only can already achieve quite competitive performance on the challenging COCO object detection benchmark, e.g., YOLOS-Base directly adopted from BERT-Base architecture can obtain 42.0 box AP on COCO val. We also discuss the impacts as well as limitations of current pre-train schemes and model scaling strategies for Transformer in vision through YOLOS.* -Tips: - -- One can use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](detr), YOLOS doesn't require a `pixel_mask` to be created. - drawing @@ -47,6 +43,12 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + +Use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](detr), YOLOS doesn't require a `pixel_mask` to be created.
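For example, a minimal object detection sketch along these lines (the `hustvl/yolos-tiny` checkpoint and the COCO sample image are illustrative choices, not prescribed by the original docs):

```python
import requests
import torch
from PIL import Image
from transformers import YolosForObjectDetection, YolosImageProcessor

# a COCO validation image, used only for demonstration
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")

# prepare the image; unlike DETR, no pixel_mask is created
inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# rescale the predicted boxes to the original image size and keep confident detections
target_sizes = torch.tensor([image.size[::-1]])
results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(c, 2) for c in box.tolist()])
```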
+ + + ## YolosConfig [[autodoc]] YolosConfig @@ -65,13 +67,11 @@ If you're interested in submitting a resource to be included here, please feel f - pad - post_process_object_detection - ## YolosModel [[autodoc]] YolosModel - forward - ## YolosForObjectDetection [[autodoc]] YolosForObjectDetection diff --git a/docs/source/en/model_doc/yoso.md b/docs/source/en/model_doc/yoso.md index 4b98cd348c9..a3dfa3fed85 100644 --- a/docs/source/en/model_doc/yoso.md +++ b/docs/source/en/model_doc/yoso.md @@ -37,7 +37,9 @@ length where we see favorable performance relative to a standard pretrained Tran for evaluating performance on long sequences, our method achieves results consistent with softmax self-attention but with sizable speed-ups and memory savings and often outperforms other efficient self-attention methods. Our code is available at this https URL* -Tips: +This model was contributed by [novice03](https://huggingface.co/novice03). The original code can be found [here](https://github.com/mlpen/YOSO). + +## Usage tips - The YOSO attention algorithm is implemented through custom CUDA kernels, functions written in CUDA C++ that can be executed multiple times in parallel on a GPU. @@ -52,9 +54,7 @@ alt="drawing" width="600"/> YOSO Attention Algorithm. Taken from the original paper. -This model was contributed by [novice03](https://huggingface.co/novice03). The original code can be found [here](https://github.com/mlpen/YOSO). - -## Documentation resources +## Resources - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) @@ -66,19 +66,16 @@ This model was contributed by [novice03](https://huggingface.co/novice03). The o [[autodoc]] YosoConfig - ## YosoModel [[autodoc]] YosoModel - forward - ## YosoForMaskedLM [[autodoc]] YosoForMaskedLM - forward - ## YosoForSequenceClassification [[autodoc]] YosoForSequenceClassification @@ -89,13 +86,11 @@ This model was contributed by [novice03](https://huggingface.co/novice03). The o [[autodoc]] YosoForMultipleChoice - forward - ## YosoForTokenClassification [[autodoc]] YosoForTokenClassification - forward - ## YosoForQuestionAnswering [[autodoc]] YosoForQuestionAnswering