diff --git a/.circleci/config.yml b/.circleci/config.yml
index 93b9e675f16..a393837806e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -139,7 +139,7 @@ jobs:
                       - v0.4-{{ checksum "setup.py" }}
             - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
             - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision]
+            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision,timm]
             - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
             - save_cache:
                   key: v0.4-torch-{{ checksum "setup.py" }}
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index 439822e068c..06618d08e8d 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -37,7 +37,7 @@ jobs:
         run: |
           apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
@@ -121,7 +121,7 @@ jobs:
         run: |
           apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index b6a3d65bee1..f98215a62c4 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -33,7 +33,7 @@ jobs:
         run: |
           apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,integrations]
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
@@ -155,7 +155,7 @@ jobs:
         run: |
           apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,integrations]
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
diff --git a/README.md b/README.md
index 37e1ee96433..7bd67b49def 100644
--- a/README.md
+++ b/README.md
@@ -215,6 +215,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
+1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
 1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
 1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
diff --git a/docs/source/community.md b/docs/source/community.md
index 38affbf1e68..206d8b4cf58 100644
--- a/docs/source/community.md
+++ b/docs/source/community.md
@@ -59,3 +59,5 @@ This page regroups resources around 🤗 Transformers developed by the community
 | [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | How to evaluate *LukeForEntitySpanClassification* on the CoNLL-2003 dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) |
 | [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | How to evaluate *BigBirdPegasusForConditionalGeneration* on PubMed dataset | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) |
 | [Speech Emotion Classification with Wav2Vec2](https://github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) |
+| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | How to use a trained *DetrForObjectDetection* model to detect objects in an image and visualize attention | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) |
+| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | How to fine-tune *DetrForObjectDetection* on a custom object detection dataset | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) |
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 5f51bf819a1..9e2d093eb8a 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -153,128 +153,131 @@ Supported models
 19. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
     distillation through attention <https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs
     Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-20. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+20. :doc:`DETR <model_doc/detr>` (from Facebook) released with the paper `End-to-End Object Detection with Transformers
+    <https://arxiv.org/abs/2005.12872>`__ by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier,
+    Alexander Kirillov, Sergey Zagoruyko.
+21. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
     Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
     Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-21. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+22. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
     distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
     Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
     <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
     <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
     `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
     version of DistilBERT.
-22. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+23. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
     Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
     Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-23. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+24. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
     Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
     Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-24. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+25. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
     Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
     Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-25. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+26. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
     Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
     Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-26. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+27. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
     Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
     and Ilya Sutskever.
-27. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+28. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
     Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
     Luan, Dario Amodei** and Ilya Sutskever**.
-28. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
+29. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
     <https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-29. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
+30. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
     <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
-30. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+31. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
     of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
     Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-31. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+32. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
     <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-32. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+33. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
     Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-33. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
+34. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
     Representations with Entity-aware Self-attention <https://arxiv.org/abs/2010.01057>`__ by Ikuya Yamada, Akari Asai,
     Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-34. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+35. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
     Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
     by Hao Tan and Mohit Bansal.
-35. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
+36. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
     Machine Translation <https://arxiv.org/abs/2010.11125>`__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
     Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
     Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-36. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+37. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
     Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
     Translator Team.
-37. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+38. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
     Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
     Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-38. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
+39. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
     Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
     Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-39. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
+40. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
     Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
     Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-40. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
+41. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
     Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
     Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-41. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+42. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
     Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
     Jianfeng Lu, Tie-Yan Liu.
-42. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+43. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
     text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
     Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-43. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+44. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
     Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
     Mohammad Saleh and Peter J. Liu.
-44. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+45. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
     Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
     Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-45. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+46. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
     Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-46. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+47. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
     Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
     Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-47. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
+48. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
     Enhanced Transformer with Rotary Position Embedding <https://arxiv.org/pdf/2104.09864v1.pdf>`__ by Jianlin Su and
     Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-48. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
+49. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
     `fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
     Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-49. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
+50. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
     about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
     Krishna, and Kurt W. Keutzer.
-50. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+51. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
     Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
     Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-51. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+52. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
     Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
     Francesco Piccinno and Julian Martin Eisenschlos.
-52. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+53. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
     Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
     Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-53. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
+54. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
     Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
     Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
     Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-54. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
+55. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
     Performant Baseline for Vision and Language <https://arxiv.org/pdf/1908.03557>`__ by Liunian Harold Li, Mark
     Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-55. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+56. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
     Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
     Zhou, Abdelrahman Mohamed, Michael Auli.
-56. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+57. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
     Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-57. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+58. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
     Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
     Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-58. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+59. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
     Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
     Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
     Zettlemoyer and Veselin Stoyanov.
-59. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive
+60. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive
     Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
     Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-60. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
+61. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
     Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
     Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
 
@@ -318,6 +321,8 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          ConvBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            DETR             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           DeBERTa           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
@@ -502,6 +507,7 @@ Flax), PyTorch, and/or TensorFlow.
     model_doc/deberta
     model_doc/deberta_v2
     model_doc/deit
+    model_doc/detr
     model_doc/dialogpt
     model_doc/distilbert
     model_doc/dpr
diff --git a/docs/source/model_doc/detr.rst b/docs/source/model_doc/detr.rst
new file mode 100644
index 00000000000..dbd1fb99aad
--- /dev/null
+++ b/docs/source/model_doc/detr.rst
@@ -0,0 +1,202 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+DETR
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The DETR model was proposed in `End-to-End Object Detection with Transformers <https://arxiv.org/abs/2005.12872>`__ by
+Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov and Sergey Zagoruyko. DETR
+consists of a convolutional backbone followed by an encoder-decoder Transformer which can be trained end-to-end for
+object detection. It greatly simplifies a lot of the complexity of models like Faster-R-CNN and Mask-R-CNN, which use
+things like region proposals, non-maximum suppression procedure and anchor generation. Moreover, DETR can also be
+naturally extended to perform panoptic segmentation, by simply adding a mask head on top of the decoder outputs.
+
+The abstract from the paper is the following:
+
+*We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the
+detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression
+procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the
+new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via
+bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries,
+DETR reasons about the relations of the objects and the global image context to directly output the final set of
+predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many
+other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and
+highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily
+generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive
+baselines.*
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
+<https://github.com/facebookresearch/detr>`__.
+
+Here's a TLDR explaining how :class:`~transformers.DetrForObjectDetection` works:
+
+First, an image is sent through a pre-trained convolutional backbone (in the paper, the authors use
+ResNet-50/ResNet-101). Let's assume we also add a batch dimension. This means that the input to the backbone is a
+tensor of shape :obj:`(batch_size, 3, height, width)`, assuming the image has 3 color channels (RGB). The CNN backbone
+outputs a new lower-resolution feature map, typically of shape :obj:`(batch_size, 2048, height/32, width/32)`. This is
+then projected to match the hidden dimension of the Transformer of DETR, which is :obj:`256` by default, using a
+:obj:`nn.Conv2D` layer. So now, we have a tensor of shape :obj:`(batch_size, 256, height/32, width/32).` Next, the
+feature map is flattened and transposed to obtain a tensor of shape :obj:`(batch_size, seq_len, d_model)` =
+:obj:`(batch_size, width/32*height/32, 256)`. So a difference with NLP models is that the sequence length is actually
+longer than usual, but with a smaller :obj:`d_model` (which in NLP is typically 768 or higher).
+
+Next, this is sent through the encoder, outputting :obj:`encoder_hidden_states` of the same shape (you can consider
+these as image features). Next, so-called **object queries** are sent through the decoder. This is a tensor of shape
+:obj:`(batch_size, num_queries, d_model)`, with :obj:`num_queries` typically set to 100 and initialized with zeros.
+These input embeddings are learnt positional encodings that the authors refer to as object queries, and similarly to
+the encoder, they are added to the input of each attention layer. Each object query will look for a particular object
+in the image. The decoder updates these embeddings through multiple self-attention and encoder-decoder attention layers
+to output :obj:`decoder_hidden_states` of the same shape: :obj:`(batch_size, num_queries, d_model)`. Next, two heads
+are added on top for object detection: a linear layer for classifying each object query into one of the objects or "no
+object", and a MLP to predict bounding boxes for each query.
+
+The model is trained using a **bipartite matching loss**: so what we actually do is compare the predicted classes +
+bounding boxes of each of the N = 100 object queries to the ground truth annotations, padded up to the same length N
+(so if an image only contains 4 objects, 96 annotations will just have a "no object" as class and "no bounding box" as
+bounding box). The `Hungarian matching algorithm <https://en.wikipedia.org/wiki/Hungarian_algorithm>`__ is used to find
+an optimal one-to-one mapping of each of the N queries to each of the N annotations. Next, standard cross-entropy (for
+the classes) and a linear combination of the L1 and `generalized IoU loss <https://giou.stanford.edu/>`__ (for the
+bounding boxes) are used to optimize the parameters of the model.
+
+DETR can be naturally extended to perform panoptic segmentation (which unifies semantic segmentation and instance
+segmentation). :class:`~transformers.DetrForSegmentation` adds a segmentation mask head on top of
+:class:`~transformers.DetrForObjectDetection`. The mask head can be trained either jointly, or in a two steps process,
+where one first trains a :class:`~transformers.DetrForObjectDetection` model to detect bounding boxes around both
+"things" (instances) and "stuff" (background things like trees, roads, sky), then freeze all the weights and train only
+the mask head for 25 epochs. Experimentally, these two approaches give similar results. Note that predicting boxes is
+required for the training to be possible, since the Hungarian matching is computed using distances between boxes.
+
+Tips:
+
+- DETR uses so-called **object queries** to detect objects in an image. The number of queries determines the maximum
+  number of objects that can be detected in a single image, and is set to 100 by default (see parameter
+  :obj:`num_queries` of :class:`~transformers.DetrConfig`). Note that it's good to have some slack (in COCO, the
+  authors used 100, while the maximum number of objects in a COCO image is ~70).
+- The decoder of DETR updates the query embeddings in parallel. This is different from language models like GPT-2,
+  which use autoregressive decoding instead of parallel. Hence, no causal attention mask is used.
+- DETR adds position embeddings to the hidden states at each self-attention and cross-attention layer before projecting
+  to queries and keys. For the position embeddings of the image, one can choose between fixed sinusoidal or learned
+  absolute position embeddings. By default, the parameter :obj:`position_embedding_type` of
+  :class:`~transformers.DetrConfig` is set to :obj:`"sine"`.
+- During training, the authors of DETR did find it helpful to use auxiliary losses in the decoder, especially to help
+  the model output the correct number of objects of each class. If you set the parameter :obj:`auxiliary_loss` of
+  :class:`~transformers.DetrConfig` to :obj:`True`, then prediction feedforward neural networks and Hungarian losses
+  are added after each decoder layer (with the FFNs sharing parameters).
+- If you want to train the model in a distributed environment across multiple nodes, then one should update the
+  `num_boxes` variable in the `DetrLoss` class of `modeling_detr.py`. When training on multiple nodes, this should be
+  set to the average number of target boxes across all nodes, as can be seen in the original implementation `here
+  <https://github.com/facebookresearch/detr/blob/a54b77800eb8e64e3ad0d8237789fcbf2f8350c5/models/detr.py#L227-L232>`__.
+- :class:`~transformers.DetrForObjectDetection` and :class:`~transformers.DetrForSegmentation` can be initialized with
+  any convolutional backbone available in the `timm library <https://github.com/rwightman/pytorch-image-models>`__.
+  Initializing with a MobileNet backbone for example can be done by setting the :obj:`backbone` attribute of
+  :class:`~transformers.DetrConfig` to :obj:`"tf_mobilenetv3_small_075"`, and then initializing the model with that
+  config.
+- DETR resizes the input images such that the shortest side is at least a certain amount of pixels while the longest is
+  at most 1333 pixels. At training time, scale augmentation is used such that the shortest side is randomly set to at
+  least 480 and at most 800 pixels. At inference time, the shortest side is set to 800. One can use
+  :class:`~transformers.DetrFeatureExtractor` to prepare images (and optional annotations in COCO format) for the
+  model. Due to this resizing, images in a batch can have different sizes. DETR solves this by padding images up to the
+  largest size in a batch, and by creating a pixel mask that indicates which pixels are real/which are padding.
+  Alternatively, one can also define a custom :obj:`collate_fn` in order to batch images together, using
+  :meth:`~transformers.DetrFeatureExtractor.pad_and_create_pixel_mask`.
+- The size of the images will determine the amount of memory being used, and will thus determine the :obj:`batch_size`.
+  It is advised to use a batch size of 2 per GPU. See `this Github thread
+  <https://github.com/facebookresearch/detr/issues/150>`__ for more info.
+
+As a summary, consider the following table:
+
++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+
+| **Task**                                    | **Object detection**                                    | **Instance segmentation**                                            | **Panoptic segmentation**                                              |
++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+
+| **Description**                             | Predicting bounding boxes and class labels around       | Predicting masks around objects (i.e. instances) in an image         | Predicting masks around both objects (i.e. instances) as well as       |
+|                                             | objects in an image                                     |                                                                      | "stuff" (i.e. background things like trees and roads) in an image      |
++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+
+| **Model**                                   | :class:`~transformers.DetrForObjectDetection`           | :class:`~transformers.DetrForSegmentation`                           | :class:`~transformers.DetrForSegmentation`                             |
++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+
+| **Example dataset**                         | COCO detection                                          | COCO detection,                                                      | COCO panoptic                                                          |
+|                                             |                                                         | COCO panoptic                                                        |                                                                        |
++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+
+| **Format of annotations to provide to**     | {‘image_id’: int,                                       | {‘image_id’: int,                                                    | {‘file_name: str,                                                      |
+| :class:`~transformers.DetrFeatureExtractor` | ‘annotations’: List[Dict]}, each Dict being a COCO      | ‘annotations’: [List[Dict]] } (in case of COCO detection)            | ‘image_id: int,                                                        |
+|                                             | object annotation (containing keys "image_id",          |                                                                      | ‘segments_info’: List[Dict] }                                          |
+|                                             |                                                         | or                                                                   |                                                                        |
+|                                             |                                                         |                                                                      | and masks_path (path to directory containing PNG files of the masks)   |
+|                                             |                                                         | {‘file_name’: str,                                                   |                                                                        |
+|                                             |                                                         | ‘image_id’: int,                                                     |                                                                        |
+|                                             |                                                         | ‘segments_info’: List[Dict]} (in case of COCO panoptic)              |                                                                        |
++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+
+| **Postprocessing** (i.e. converting the     | :meth:`~transformers.DetrFeatureExtractor.post_process` | :meth:`~transformers.DetrFeatureExtractor.post_process_segmentation` | :meth:`~transformers.DetrFeatureExtractor.post_process_segmentation`,  |
+| output of the model to COCO API)            |                                                         |                                                                      | :meth:`~transformers.DetrFeatureExtractor.post_process_panoptic`       |
++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+
+| **evaluators**                              | :obj:`CocoEvaluator` with iou_types = “bbox”            | :obj:`CocoEvaluator` with iou_types = “bbox”, “segm”                 | :obj:`CocoEvaluator` with iou_tupes = “bbox, “segm”                    |
+|                                             |                                                         |                                                                      |                                                                        |
+|                                             |                                                         |                                                                      | :obj:`PanopticEvaluator`                                               |
++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+
+
+In short, one should prepare the data either in COCO detection or COCO panoptic format, then use
+:class:`~transformers.DetrFeatureExtractor` to create :obj:`pixel_values`, :obj:`pixel_mask` and optional
+:obj:`labels`, which can then be used to train (or fine-tune) a model. For evaluation, one should first convert the
+outputs of the model using one of the postprocessing methods of :class:`~transformers.DetrFeatureExtractor`. These can
+be be provided to either :obj:`CocoEvaluator` or :obj:`PanopticEvaluator`, which allow you to calculate metrics like
+mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are implemented in the `original repository
+<https://github.com/facebookresearch/detr>`__. See the example notebooks for more info regarding evaluation.
+
+
+DETR specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.detr.modeling_detr.DetrModelOutput
+    :members:
+
+.. autoclass:: transformers.models.detr.modeling_detr.DetrObjectDetectionOutput
+    :members:
+
+.. autoclass:: transformers.models.detr.modeling_detr.DetrSegmentationOutput
+    :members:
+
+
+DetrConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DetrConfig
+    :members:
+
+
+DetrFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DetrFeatureExtractor
+    :members: __call__, pad_and_create_pixel_mask, post_process, post_process_segmentation, post_process_panoptic
+
+
+DetrModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DetrModel
+    :members: forward
+
+
+DetrForObjectDetection
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DetrForObjectDetection
+    :members: forward
+
+
+DetrForSegmentation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DetrForSegmentation
+    :members: forward
diff --git a/setup.py b/setup.py
index b8ed916b0e0..32e4608102e 100644
--- a/setup.py
+++ b/setup.py
@@ -142,6 +142,7 @@ _deps = [
     "tensorflow-cpu>=2.3",
     "tensorflow>=2.3",
     "timeout-decorator",
+    "timm",
     "tokenizers>=0.10.1,<0.11",
     "torch>=1.0",
     "torchaudio",
@@ -249,6 +250,7 @@ extras["integrations"] = extras["optuna"] + extras["ray"]
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
 extras["speech"] = deps_list("soundfile", "torchaudio")
 extras["vision"] = deps_list("Pillow")
+extras["timm"] = deps_list("timm")
 
 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["testing"] = (
@@ -270,6 +272,7 @@ extras["all"] = (
     + extras["speech"]
     + extras["vision"]
     + extras["integrations"]
+    + extras["timm"]
 )
 
 extras["docs_specific"] = deps_list(
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index c699983ac3d..386d64892cc 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -47,6 +47,7 @@ from .file_utils import (
     is_sentencepiece_available,
     is_speech_available,
     is_tf_available,
+    is_timm_available,
     is_tokenizers_available,
     is_torch_available,
     is_vision_available,
@@ -101,10 +102,12 @@ _import_structure = {
         "is_flax_available",
         "is_psutil_available",
         "is_py3nvml_available",
+        "is_scipy_available",
         "is_sentencepiece_available",
         "is_sklearn_available",
         "is_speech_available",
         "is_tf_available",
+        "is_timm_available",
         "is_tokenizers_available",
         "is_torch_available",
         "is_torch_tpu_available",
@@ -180,6 +183,7 @@ _import_structure = {
     "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"],
     "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"],
     "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"],
+    "models.detr": ["DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetrConfig"],
     "models.distilbert": ["DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DistilBertConfig", "DistilBertTokenizer"],
     "models.dpr": [
         "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -405,6 +409,7 @@ if is_vision_available():
     _import_structure["models.clip"].append("CLIPFeatureExtractor")
     _import_structure["models.clip"].append("CLIPProcessor")
     _import_structure["models.deit"].append("DeiTFeatureExtractor")
+    _import_structure["models.detr"].append("DetrFeatureExtractor")
     _import_structure["models.vit"].append("ViTFeatureExtractor")
 else:
     from .utils import dummy_vision_objects
@@ -413,6 +418,23 @@ else:
         name for name in dir(dummy_vision_objects) if not name.startswith("_")
     ]
 
+# Timm-backed objects
+if is_timm_available() and is_vision_available():
+    _import_structure["models.detr"].extend(
+        [
+            "DETR_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "DetrForObjectDetection",
+            "DetrForSegmentation",
+            "DetrModel",
+        ]
+    )
+else:
+    from .utils import dummy_timm_objects
+
+    _import_structure["utils.dummy_timm_objects"] = [
+        name for name in dir(dummy_timm_objects) if not name.startswith("_")
+    ]
+
 # PyTorch-backed objects
 if is_torch_available():
     _import_structure["benchmark.benchmark"] = ["PyTorchBenchmark"]
@@ -489,6 +511,7 @@ if is_torch_available():
             "MODEL_FOR_MASKED_LM_MAPPING",
             "MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
             "MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
+            "MODEL_FOR_OBJECT_DETECTION_MAPPING",
             "MODEL_FOR_PRETRAINING_MAPPING",
             "MODEL_FOR_QUESTION_ANSWERING_MAPPING",
             "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
@@ -1587,10 +1610,12 @@ if TYPE_CHECKING:
         is_flax_available,
         is_psutil_available,
         is_py3nvml_available,
+        is_scipy_available,
         is_sentencepiece_available,
         is_sklearn_available,
         is_speech_available,
         is_tf_available,
+        is_timm_available,
         is_tokenizers_available,
         is_torch_available,
         is_torch_tpu_available,
@@ -1666,6 +1691,7 @@ if TYPE_CHECKING:
     from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer
     from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
     from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig
+    from .models.detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig
     from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer
     from .models.dpr import (
         DPR_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -1863,13 +1889,23 @@ if TYPE_CHECKING:
         from .image_utils import ImageFeatureExtractionMixin
         from .models.clip import CLIPFeatureExtractor, CLIPProcessor
         from .models.deit import DeiTFeatureExtractor
+        from .models.detr import DetrFeatureExtractor
         from .models.vit import ViTFeatureExtractor
     else:
         from .utils.dummy_vision_objects import *
 
     # Modeling
-    if is_torch_available():
+    if is_timm_available() and is_vision_available():
+        from .models.detr import (
+            DETR_PRETRAINED_MODEL_ARCHIVE_LIST,
+            DetrForObjectDetection,
+            DetrForSegmentation,
+            DetrModel,
+        )
+    else:
+        from .utils.dummy_timm_objects import *
 
+    if is_torch_available():
         # Benchmarks
         from .benchmark.benchmark import PyTorchBenchmark
         from .benchmark.benchmark_args import PyTorchBenchmarkArguments
@@ -1939,6 +1975,7 @@ if TYPE_CHECKING:
             MODEL_FOR_MASKED_LM_MAPPING,
             MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
             MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
+            MODEL_FOR_OBJECT_DETECTION_MAPPING,
             MODEL_FOR_PRETRAINING_MAPPING,
             MODEL_FOR_QUESTION_ANSWERING_MAPPING,
             MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index ec055db25bd..0f4c9573991 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -59,6 +59,7 @@ deps = {
     "tensorflow-cpu": "tensorflow-cpu>=2.3",
     "tensorflow": "tensorflow>=2.3",
     "timeout-decorator": "timeout-decorator",
+    "timm": "timm",
     "tokenizers": "tokenizers>=0.10.1,<0.11",
     "torch": "torch>=1.0",
     "torchaudio": "torchaudio",
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 9a55fe18edc..51daa86cb34 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -174,6 +174,14 @@ except importlib_metadata.PackageNotFoundError:
     _soundfile_available = False
 
 
+_timm_available = importlib.util.find_spec("timm") is not None
+try:
+    _timm_version = importlib_metadata.version("timm")
+    logger.debug(f"Successfully imported timm version {_timm_version}")
+except importlib_metadata.PackageNotFoundError:
+    _timm_available = False
+
+
 _torchaudio_available = importlib.util.find_spec("torchaudio") is not None
 try:
     _torchaudio_version = importlib_metadata.version("torchaudio")
@@ -317,12 +325,14 @@ def is_faiss_available():
     return _faiss_available
 
 
+def is_scipy_available():
+    return importlib.util.find_spec("scipy") is not None
+
+
 def is_sklearn_available():
     if importlib.util.find_spec("sklearn") is None:
         return False
-    if importlib.util.find_spec("scipy") is None:
-        return False
-    return importlib.util.find_spec("sklearn.metrics") and importlib.util.find_spec("scipy.stats")
+    return is_scipy_available() and importlib.util.find_spec("sklearn.metrics")
 
 
 def is_sentencepiece_available():
@@ -411,6 +421,10 @@ def is_soundfile_availble():
     return _soundfile_available
 
 
+def is_timm_available():
+    return _timm_available
+
+
 def is_torchaudio_available():
     return _torchaudio_available
 
@@ -536,12 +550,24 @@ explained here: https://pandas.pydata.org/pandas-docs/stable/getting_started/ins
 """
 
 
+# docstyle-ignore
+SCIPY_IMPORT_ERROR = """
+{0} requires the scipy library but it was not found in your environment. You can install it with pip:
+`pip install scipy`
+"""
+
+
 # docstyle-ignore
 SPEECH_IMPORT_ERROR = """
 {0} requires the torchaudio library but it was not found in your environment. You can install it with pip:
 `pip install torchaudio`
 """
 
+# docstyle-ignore
+TIMM_IMPORT_ERROR = """
+{0} requires the timm library but it was not found in your environment. You can install it with pip:
+`pip install timm`
+"""
 
 # docstyle-ignore
 VISION_IMPORT_ERROR = """
@@ -562,9 +588,11 @@ BACKENDS_MAPPING = OrderedDict(
         ("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)),
         ("speech", (is_speech_available, SPEECH_IMPORT_ERROR)),
         ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
+        ("timm", (is_timm_available, TIMM_IMPORT_ERROR)),
         ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
         ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
         ("vision", (is_vision_available, VISION_IMPORT_ERROR)),
+        ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
     ]
 )
 
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 76075014535..f4e5c09f568 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -36,6 +36,7 @@ from . import (
     ctrl,
     deberta,
     deit,
+    detr,
     dialogpt,
     distilbert,
     dpr,
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index deb976d3415..a620c0a75dd 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -35,6 +35,7 @@ if is_torch_available():
         "MODEL_FOR_MASKED_LM_MAPPING",
         "MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
         "MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
+        "MODEL_FOR_OBJECT_DETECTION_MAPPING",
         "MODEL_FOR_PRETRAINING_MAPPING",
         "MODEL_FOR_QUESTION_ANSWERING_MAPPING",
         "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
@@ -119,6 +120,7 @@ if TYPE_CHECKING:
             MODEL_FOR_MASKED_LM_MAPPING,
             MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
             MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
+            MODEL_FOR_OBJECT_DETECTION_MAPPING,
             MODEL_FOR_PRETRAINING_MAPPING,
             MODEL_FOR_QUESTION_ANSWERING_MAPPING,
             MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index e95d7cac12f..c103622d698 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -39,6 +39,7 @@ from ..ctrl.configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLCo
 from ..deberta.configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig
 from ..deberta_v2.configuration_deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
 from ..deit.configuration_deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig
+from ..detr.configuration_detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig
 from ..distilbert.configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
 from ..dpr.configuration_dpr import DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig
 from ..electra.configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
@@ -99,6 +100,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
         BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP,
         DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
         LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DETR_PRETRAINED_CONFIG_ARCHIVE_MAP,
         GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP,
         BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP,
         MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -156,6 +158,7 @@ CONFIG_MAPPING = OrderedDict(
         ("bigbird_pegasus", BigBirdPegasusConfig),
         ("deit", DeiTConfig),
         ("luke", LukeConfig),
+        ("detr", DetrConfig),
         ("gpt_neo", GPTNeoConfig),
         ("big_bird", BigBirdConfig),
         ("speech_to_text", Speech2TextConfig),
@@ -219,6 +222,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
         ("bigbird_pegasus", "BigBirdPegasus"),
         ("deit", "DeiT"),
         ("luke", "LUKE"),
+        ("detr", "DETR"),
         ("gpt_neo", "GPT Neo"),
         ("big_bird", "BigBird"),
         ("speech_to_text", "Speech2Text"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 3cf3062f433..8b144b83c71 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -106,6 +106,7 @@ from ..deberta_v2.modeling_deberta_v2 import (
     DebertaV2Model,
 )
 from ..deit.modeling_deit import DeiTForImageClassification, DeiTForImageClassificationWithTeacher, DeiTModel
+from ..detr.modeling_detr import DetrForObjectDetection, DetrModel
 from ..distilbert.modeling_distilbert import (
     DistilBertForMaskedLM,
     DistilBertForMultipleChoice,
@@ -316,6 +317,7 @@ from .configuration_auto import (
     DebertaConfig,
     DebertaV2Config,
     DeiTConfig,
+    DetrConfig,
     DistilBertConfig,
     DPRConfig,
     ElectraConfig,
@@ -372,6 +374,7 @@ MODEL_MAPPING = OrderedDict(
         (BigBirdPegasusConfig, BigBirdPegasusModel),
         (DeiTConfig, DeiTModel),
         (LukeConfig, LukeModel),
+        (DetrConfig, DetrModel),
         (GPTNeoConfig, GPTNeoModel),
         (BigBirdConfig, BigBirdModel),
         (Speech2TextConfig, Speech2TextModel),
@@ -586,6 +589,13 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
     ]
 )
 
+MODEL_FOR_OBJECT_DETECTION_MAPPING = OrderedDict(
+    [
+        # Model for Object Detection mapping
+        (DetrConfig, DetrForObjectDetection),
+    ]
+)
+
 MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
     [
         # Model for Seq2Seq Causal LM mapping
diff --git a/src/transformers/models/detr/__init__.py b/src/transformers/models/detr/__init__.py
new file mode 100644
index 00000000000..b0dd3e2c674
--- /dev/null
+++ b/src/transformers/models/detr/__init__.py
@@ -0,0 +1,72 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...file_utils import _BaseLazyModule, is_timm_available, is_vision_available
+
+
+_import_structure = {
+    "configuration_detr": ["DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetrConfig"],
+}
+
+if is_vision_available():
+    _import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"]
+
+if is_timm_available():
+    _import_structure["modeling_detr"] = [
+        "DETR_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "DetrForObjectDetection",
+        "DetrForSegmentation",
+        "DetrModel",
+        "DetrPreTrainedModel",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig
+
+    if is_vision_available():
+        from .feature_extraction_detr import DetrFeatureExtractor
+
+    if is_timm_available():
+        from .modeling_detr import (
+            DETR_PRETRAINED_MODEL_ARCHIVE_LIST,
+            DetrForObjectDetection,
+            DetrForSegmentation,
+            DetrModel,
+            DetrPreTrainedModel,
+        )
+
+else:
+    import importlib
+    import os
+    import sys
+
+    class _LazyModule(_BaseLazyModule):
+        """
+        Module class that surfaces all objects but only performs associated imports when the objects are requested.
+        """
+
+        __file__ = globals()["__file__"]
+        __path__ = [os.path.dirname(__file__)]
+
+        def _get_module(self, module_name: str):
+            return importlib.import_module("." + module_name, self.__name__)
+
+    sys.modules[__name__] = _LazyModule(__name__, _import_structure)
diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py
new file mode 100644
index 00000000000..52625b1494c
--- /dev/null
+++ b/src/transformers/models/detr/configuration_detr.py
@@ -0,0 +1,205 @@
+# coding=utf-8
+# Copyright 2021 Facebook AI Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" DETR model configuration """
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "facebook/detr-resnet-50": "https://huggingface.co/facebook/detr-resnet-50/resolve/main/config.json",
+    # See all DETR models at https://huggingface.co/models?filter=detr
+}
+
+
+class DetrConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.DetrModel`. It is used to
+    instantiate a DETR model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the DETR `facebook/detr-resnet-50
+    <https://huggingface.co/facebook/detr-resnet-50>`__ architecture.
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+    Args:
+        num_queries (:obj:`int`, `optional`, defaults to 100):
+            Number of object queries, i.e. detection slots. This is the maximal number of objects
+            :class:`~transformers.DetrModel` can detect in a single image. For COCO, we recommend 100 queries.
+        d_model (:obj:`int`, `optional`, defaults to 256):
+            Dimension of the layers.
+        encoder_layers (:obj:`int`, `optional`, defaults to 6):
+            Number of encoder layers.
+        decoder_layers (:obj:`int`, `optional`, defaults to 6):
+            Number of decoder layers.
+        encoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for classifier.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        init_xavier_std (:obj:`float`, `optional`, defaults to 1):
+            The scaling factor used for the Xavier initialization gain in the HM Attention map module.
+        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
+            https://arxiv.org/abs/1909.11556>`__ for more details.
+        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
+            https://arxiv.org/abs/1909.11556>`__ for more details.
+        auxiliary_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"sine"`):
+            Type of position embeddings to be used on top of the image features. One of :obj:`"sine"` or
+            :obj:`"learned"`.
+        backbone (:obj:`str`, `optional`, defaults to :obj:`"resnet50"`):
+            Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a
+            list of all available models, see `this page
+            <https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model>`__.
+        dilation (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to replace stride with dilation in the last convolutional block (DC5).
+        class_cost (:obj:`float`, `optional`, defaults to 1):
+            Relative weight of the classification error in the Hungarian matching cost.
+        bbox_cost (:obj:`float`, `optional`, defaults to 5):
+            Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
+        giou_cost (:obj:`float`, `optional`, defaults to 2):
+            Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
+        mask_loss_coefficient (:obj:`float`, `optional`, defaults to 1):
+            Relative weight of the Focal loss in the panoptic segmentation loss.
+        dice_loss_coefficient (:obj:`float`, `optional`, defaults to 1):
+            Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
+        bbox_loss_coefficient (:obj:`float`, `optional`, defaults to 5):
+            Relative weight of the L1 bounding box loss in the object detection loss.
+        giou_loss_coefficient (:obj:`float`, `optional`, defaults to 2):
+            Relative weight of the generalized IoU loss in the object detection loss.
+        eos_coefficient (:obj:`float`, `optional`, defaults to 0.1):
+            Relative classification weight of the 'no-object' class in the object detection loss.
+
+    Examples::
+
+        >>> from transformers import DetrModel, DetrConfig
+
+        >>> # Initializing a DETR facebook/detr-resnet-50 style configuration
+        >>> configuration = DetrConfig()
+
+        >>> # Initializing a model from the facebook/detr-resnet-50 style configuration
+        >>> model = DetrModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+    """
+    model_type = "detr"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        num_queries=100,
+        max_position_embeddings=1024,
+        encoder_layers=6,
+        encoder_ffn_dim=2048,
+        encoder_attention_heads=8,
+        decoder_layers=6,
+        decoder_ffn_dim=2048,
+        decoder_attention_heads=8,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        is_encoder_decoder=True,
+        activation_function="relu",
+        d_model=256,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        init_xavier_std=1.0,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        auxiliary_loss=False,
+        position_embedding_type="sine",
+        backbone="resnet50",
+        dilation=False,
+        class_cost=1,
+        bbox_cost=5,
+        giou_cost=2,
+        mask_loss_coefficient=1,
+        dice_loss_coefficient=1,
+        bbox_loss_coefficient=5,
+        giou_loss_coefficient=2,
+        eos_coefficient=0.1,
+        **kwargs
+    ):
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+        self.num_queries = num_queries
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.init_xavier_std = init_xavier_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.num_hidden_layers = encoder_layers
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        self.auxiliary_loss = auxiliary_loss
+        self.position_embedding_type = position_embedding_type
+        self.backbone = backbone
+        self.dilation = dilation
+        # Hungarian matcher
+        self.class_cost = class_cost
+        self.bbox_cost = bbox_cost
+        self.giou_cost = giou_cost
+        # Loss coefficients
+        self.mask_loss_coefficient = mask_loss_coefficient
+        self.dice_loss_coefficient = dice_loss_coefficient
+        self.bbox_loss_coefficient = bbox_loss_coefficient
+        self.giou_loss_coefficient = giou_loss_coefficient
+        self.eos_coefficient = eos_coefficient
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
diff --git a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 00000000000..66165809759
--- /dev/null
+++ b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,273 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert DETR checkpoints."""
+
+
+import argparse
+from collections import OrderedDict
+from pathlib import Path
+
+import torch
+from PIL import Image
+
+import requests
+from transformers import DetrConfig, DetrFeatureExtractor, DetrForObjectDetection, DetrForSegmentation
+from transformers.utils import logging
+from transformers.utils.coco_classes import id2label
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+# here we list all keys to be renamed (original name on the left, our name on the right)
+rename_keys = []
+for i in range(6):
+    # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
+    rename_keys.append(
+        (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight")
+    )
+    rename_keys.append(
+        (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias")
+    )
+    rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight"))
+    rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias"))
+    rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight"))
+    rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias"))
+    rename_keys.append(
+        (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight")
+    )
+    rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias"))
+    rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight"))
+    rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias"))
+    # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms
+    rename_keys.append(
+        (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight")
+    )
+    rename_keys.append(
+        (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias")
+    )
+    rename_keys.append(
+        (
+            f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight",
+            f"decoder.layers.{i}.encoder_attn.out_proj.weight",
+        )
+    )
+    rename_keys.append(
+        (
+            f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias",
+            f"decoder.layers.{i}.encoder_attn.out_proj.bias",
+        )
+    )
+    rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight"))
+    rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias"))
+    rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight"))
+    rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias"))
+    rename_keys.append(
+        (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight")
+    )
+    rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias"))
+    rename_keys.append(
+        (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight")
+    )
+    rename_keys.append(
+        (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias")
+    )
+    rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight"))
+    rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias"))
+
+# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads
+rename_keys.extend(
+    [
+        ("input_proj.weight", "input_projection.weight"),
+        ("input_proj.bias", "input_projection.bias"),
+        ("query_embed.weight", "query_position_embeddings.weight"),
+        ("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
+        ("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
+        ("class_embed.weight", "class_labels_classifier.weight"),
+        ("class_embed.bias", "class_labels_classifier.bias"),
+        ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
+        ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
+        ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
+        ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
+        ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
+        ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
+    ]
+)
+
+
+def rename_key(state_dict, old, new):
+    val = state_dict.pop(old)
+    state_dict[new] = val
+
+
+def rename_backbone_keys(state_dict):
+    new_state_dict = OrderedDict()
+    for key, value in state_dict.items():
+        if "backbone.0.body" in key:
+            new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model")
+            new_state_dict[new_key] = value
+        else:
+            new_state_dict[key] = value
+
+    return new_state_dict
+
+
+def read_in_q_k_v(state_dict, is_panoptic=False):
+    prefix = ""
+    if is_panoptic:
+        prefix = "detr."
+
+    # first: transformer encoder
+    for i in range(6):
+        # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias)
+        in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
+        in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")
+        # next, add query, keys and values (in that order) to the state dict
+        state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
+        state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
+        state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
+        state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
+        state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
+        state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
+    # next: transformer decoder (which is a bit more complex because it also includes cross-attention)
+    for i in range(6):
+        # read in weights + bias of input projection layer of self-attention
+        in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight")
+        in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias")
+        # next, add query, keys and values (in that order) to the state dict
+        state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
+        state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
+        state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
+        state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
+        state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
+        state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
+        # read in weights + bias of input projection layer of cross-attention
+        in_proj_weight_cross_attn = state_dict.pop(
+            f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight"
+        )
+        in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias")
+        # next, add query, keys and values (in that order) of cross-attention to the state dict
+        state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :]
+        state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256]
+        state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :]
+        state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512]
+        state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :]
+        state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:]
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+
+    return im
+
+
+@torch.no_grad()
+def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
+    """
+    Copy/paste/tweak model's weights to our DETR structure.
+    """
+
+    # load default config
+    config = DetrConfig()
+    # set backbone and dilation attributes
+    if "resnet101" in model_name:
+        config.backbone = "resnet101"
+    if "dc5" in model_name:
+        config.dilation = True
+    is_panoptic = "panoptic" in model_name
+    if is_panoptic:
+        config.num_labels = 250
+    else:
+        config.num_labels = 91
+        config.id2label = id2label
+        config.label2id = {v: k for k, v in id2label.items()}
+
+    # load feature extractor
+    format = "coco_panoptic" if is_panoptic else "coco_detection"
+    feature_extractor = DetrFeatureExtractor(format=format)
+
+    # prepare image
+    img = prepare_img()
+    encoding = feature_extractor(images=img, return_tensors="pt")
+    pixel_values = encoding["pixel_values"]
+
+    logger.info(f"Converting model {model_name}...")
+
+    # load original model from torch hub
+    detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval()
+    state_dict = detr.state_dict()
+    # rename keys
+    for src, dest in rename_keys:
+        if is_panoptic:
+            src = "detr." + src
+        rename_key(state_dict, src, dest)
+    state_dict = rename_backbone_keys(state_dict)
+    # query, key and value matrices need special treatment
+    read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
+    # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
+    prefix = "detr.model." if is_panoptic else "model."
+    for key in state_dict.copy().keys():
+        if is_panoptic:
+            if (
+                key.startswith("detr")
+                and not key.startswith("class_labels_classifier")
+                and not key.startswith("bbox_predictor")
+            ):
+                val = state_dict.pop(key)
+                state_dict["detr.model" + key[4:]] = val
+            elif "class_labels_classifier" in key or "bbox_predictor" in key:
+                val = state_dict.pop(key)
+                state_dict["detr." + key] = val
+            elif key.startswith("bbox_attention") or key.startswith("mask_head"):
+                continue
+            else:
+                val = state_dict.pop(key)
+                state_dict[prefix + key] = val
+        else:
+            if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
+                val = state_dict.pop(key)
+                state_dict[prefix + key] = val
+    # finally, create HuggingFace model and load state dict
+    model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config)
+    model.load_state_dict(state_dict)
+    model.eval()
+    # verify our conversion
+    original_outputs = detr(pixel_values)
+    outputs = model(pixel_values)
+    assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4)
+    assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4)
+    if is_panoptic:
+        assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)
+
+    # Save model and feature extractor
+    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
+    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+    model.save_pretrained(pytorch_dump_folder_path)
+    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_name", default="detr_resnet50", type=str, help="Name of the DETR model you'd like to convert."
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model."
+    )
+    args = parser.parse_args()
+    convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py
new file mode 100644
index 00000000000..7c9b5526dc8
--- /dev/null
+++ b/src/transformers/models/detr/feature_extraction_detr.py
@@ -0,0 +1,890 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for DETR."""
+
+import io
+import pathlib
+from collections import defaultdict
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+from PIL import Image
+
+from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
+from ...file_utils import TensorType, is_torch_available
+from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor
+from ...utils import logging
+
+
+if is_torch_available():
+    import torch
+    import torch.nn.functional as F
+
+logger = logging.get_logger(__name__)
+
+
+ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]]
+
+
+# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
+def center_to_corners_format(x):
+    """
+    Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format
+    (x_0, y_0, x_1, y_1).
+    """
+    x_c, y_c, w, h = x.unbind(-1)
+    b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
+    return torch.stack(b, dim=-1)
+
+
+def corners_to_center_format(x):
+    """
+    Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1,
+    y_1) to center format (center_x, center_y, width, height).
+    """
+    x_transposed = x.T
+    x0, y0, x1, y1 = x_transposed[0], x_transposed[1], x_transposed[2], x_transposed[3]
+    b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)]
+    return np.stack(b, axis=-1)
+
+
+def masks_to_boxes(masks):
+    """
+    Compute the bounding boxes around the provided panoptic segmentation masks.
+
+    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
+
+    Returns a [N, 4] tensor, with the boxes in corner (xyxy) format.
+    """
+    if masks.size == 0:
+        return np.zeros((0, 4))
+
+    h, w = masks.shape[-2:]
+
+    y = np.arange(0, h, dtype=np.float32)
+    x = np.arange(0, w, dtype=np.float32)
+    # see https://github.com/pytorch/pytorch/issues/50276
+    y, x = np.meshgrid(y, x, indexing="ij")
+
+    x_mask = masks * np.expand_dims(x, axis=0)
+    x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
+    x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
+    x_min = x.filled(fill_value=1e8)
+    x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
+
+    y_mask = masks * np.expand_dims(y, axis=0)
+    y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
+    y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
+    y_min = y.filled(fill_value=1e8)
+    y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
+
+    return np.stack([x_min, y_min, x_max, y_max], 1)
+
+
+# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
+# Copyright (c) 2018, Alexander Kirillov
+# All rights reserved.
+def rgb_to_id(color):
+    if isinstance(color, np.ndarray) and len(color.shape) == 3:
+        if color.dtype == np.uint8:
+            color = color.astype(np.int32)
+        return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
+    return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
+
+
+def id_to_rgb(id_map):
+    if isinstance(id_map, np.ndarray):
+        id_map_copy = id_map.copy()
+        rgb_shape = tuple(list(id_map.shape) + [3])
+        rgb_map = np.zeros(rgb_shape, dtype=np.uint8)
+        for i in range(3):
+            rgb_map[..., i] = id_map_copy % 256
+            id_map_copy //= 256
+        return rgb_map
+    color = []
+    for _ in range(3):
+        color.append(id_map % 256)
+        id_map //= 256
+    return color
+
+
+class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
+    r"""
+    Constructs a DETR feature extractor.
+
+    This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+    methods. Users should refer to this superclass for more information regarding those methods.
+
+
+    Args:
+        format (:obj:`str`, `optional`, defaults to :obj:`"coco_detection"`):
+            Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to resize the input to a certain :obj:`size`.
+        size (:obj:`int`, `optional`, defaults to 800):
+            Resize the input to the given size. Only has an effect if :obj:`do_resize` is set to :obj:`True`. If size
+            is a sequence like :obj:`(width, height)`, output size will be matched to this. If size is an int, smaller
+            edge of the image will be matched to this number. i.e, if :obj:`height > width`, then image will be
+            rescaled to :obj:`(size * height / width, size)`.
+        max_size (:obj:`int`, `optional`, defaults to :obj:`1333`):
+            The largest size an image dimension can have (otherwise it's capped). Only has an effect if
+            :obj:`do_resize` is set to :obj:`True`.
+        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not to normalize the input with mean and standard deviation.
+        image_mean (:obj:`int`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]s`):
+            The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
+        image_std (:obj:`int`, `optional`, defaults to :obj:`[0.229, 0.224, 0.225]`):
+            The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
+            ImageNet std.
+    """
+
+    model_input_names = ["pixel_values", "pixel_mask"]
+
+    def __init__(
+        self,
+        format="coco_detection",
+        do_resize=True,
+        size=800,
+        max_size=1333,
+        do_normalize=True,
+        image_mean=None,
+        image_std=None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.format = self._is_valid_format(format)
+        self.do_resize = do_resize
+        self.size = size
+        self.max_size = max_size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406]  # ImageNet mean
+        self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225]  # ImageNet std
+
+    def _is_valid_format(self, format):
+        if format not in ["coco_detection", "coco_panoptic"]:
+            raise ValueError(f"Format {format} not supported")
+        return format
+
+    def prepare(self, image, target, return_segmentation_masks=False, masks_path=None):
+        if self.format == "coco_detection":
+            image, target = self.prepare_coco_detection(image, target, return_segmentation_masks)
+            return image, target
+        elif self.format == "coco_panoptic":
+            image, target = self.prepare_coco_panoptic(image, target, masks_path)
+            return image, target
+        else:
+            raise ValueError(f"Format {self.format} not supported")
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33
+    def convert_coco_poly_to_mask(self, segmentations, height, width):
+
+        try:
+            from pycocotools import mask as coco_mask
+        except ImportError:
+            raise ImportError("Pycocotools is not installed in your environment.")
+
+        masks = []
+        for polygons in segmentations:
+            rles = coco_mask.frPyObjects(polygons, height, width)
+            mask = coco_mask.decode(rles)
+            if len(mask.shape) < 3:
+                mask = mask[..., None]
+            mask = np.asarray(mask, dtype=np.uint8)
+            mask = np.any(mask, axis=2)
+            masks.append(mask)
+        if masks:
+            masks = np.stack(masks, axis=0)
+        else:
+            masks = np.zeros((0, height, width), dtype=np.uint8)
+
+        return masks
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50
+    def prepare_coco_detection(self, image, target, return_segmentation_masks=False):
+        """
+        Convert the target in COCO format into the format expected by DETR.
+        """
+        w, h = image.size
+
+        image_id = target["image_id"]
+        image_id = np.asarray([image_id], dtype=np.int64)
+
+        # get all COCO annotations for the given image
+        anno = target["annotations"]
+
+        anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+        boxes = [obj["bbox"] for obj in anno]
+        # guard against no boxes via resizing
+        boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
+        boxes[:, 2:] += boxes[:, :2]
+        boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=w)
+        boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=h)
+
+        classes = [obj["category_id"] for obj in anno]
+        classes = np.asarray(classes, dtype=np.int64)
+
+        if return_segmentation_masks:
+            segmentations = [obj["segmentation"] for obj in anno]
+            masks = self.convert_coco_poly_to_mask(segmentations, h, w)
+
+        keypoints = None
+        if anno and "keypoints" in anno[0]:
+            keypoints = [obj["keypoints"] for obj in anno]
+            keypoints = np.asarray(keypoints, dtype=np.float32)
+            num_keypoints = keypoints.shape[0]
+            if num_keypoints:
+                keypoints = keypoints.reshape((-1, 3))
+
+        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+        boxes = boxes[keep]
+        classes = classes[keep]
+        if return_segmentation_masks:
+            masks = masks[keep]
+        if keypoints is not None:
+            keypoints = keypoints[keep]
+
+        target = {}
+        target["boxes"] = boxes
+        target["class_labels"] = classes
+        if return_segmentation_masks:
+            target["masks"] = masks
+        target["image_id"] = image_id
+        if keypoints is not None:
+            target["keypoints"] = keypoints
+
+        # for conversion to coco api
+        area = np.asarray([obj["area"] for obj in anno], dtype=np.float32)
+        iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno], dtype=np.int64)
+        target["area"] = area[keep]
+        target["iscrowd"] = iscrowd[keep]
+
+        target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64)
+        target["size"] = np.asarray([int(h), int(w)], dtype=np.int64)
+
+        return image, target
+
+    def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True):
+        w, h = image.size
+        ann_info = target.copy()
+        ann_path = pathlib.Path(masks_path) / ann_info["file_name"]
+
+        if "segments_info" in ann_info:
+            masks = np.asarray(Image.open(ann_path), dtype=np.uint32)
+            masks = rgb_to_id(masks)
+
+            ids = np.array([ann["id"] for ann in ann_info["segments_info"]])
+            masks = masks == ids[:, None, None]
+            masks = np.asarray(masks, dtype=np.uint8)
+
+            labels = np.asarray([ann["category_id"] for ann in ann_info["segments_info"]], dtype=np.int64)
+
+        target = {}
+        target["image_id"] = np.asarray(
+            [ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]], dtype=np.int64
+        )
+        if return_masks:
+            target["masks"] = masks
+        target["class_labels"] = labels
+
+        target["boxes"] = masks_to_boxes(masks)
+
+        target["size"] = np.asarray([int(h), int(w)], dtype=np.int64)
+        target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64)
+        if "segments_info" in ann_info:
+            target["iscrowd"] = np.asarray([ann["iscrowd"] for ann in ann_info["segments_info"]], dtype=np.int64)
+            target["area"] = np.asarray([ann["area"] for ann in ann_info["segments_info"]], dtype=np.float32)
+
+        return image, target
+
+    def _resize(self, image, size, target=None, max_size=None):
+        """
+        Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller
+        edge of the image will be matched to this number.
+
+        If given, also resize the target accordingly.
+        """
+        if not isinstance(image, Image.Image):
+            image = self.to_pil_image(image)
+
+        def get_size_with_aspect_ratio(image_size, size, max_size=None):
+            w, h = image_size
+            if max_size is not None:
+                min_original_size = float(min((w, h)))
+                max_original_size = float(max((w, h)))
+                if max_original_size / min_original_size * size > max_size:
+                    size = int(round(max_size * min_original_size / max_original_size))
+
+            if (w <= h and w == size) or (h <= w and h == size):
+                return (h, w)
+
+            if w < h:
+                ow = size
+                oh = int(size * h / w)
+            else:
+                oh = size
+                ow = int(size * w / h)
+
+            return (oh, ow)
+
+        def get_size(image_size, size, max_size=None):
+            if isinstance(size, (list, tuple)):
+                return size
+            else:
+                # size returned must be (w, h) since we use PIL to resize images
+                # so we revert the tuple
+                return get_size_with_aspect_ratio(image_size, size, max_size)[::-1]
+
+        size = get_size(image.size, size, max_size)
+        rescaled_image = self.resize(image, size=size)
+
+        if target is None:
+            return rescaled_image, None
+
+        ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
+        ratio_width, ratio_height = ratios
+
+        target = target.copy()
+        if "boxes" in target:
+            boxes = target["boxes"]
+            scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
+            target["boxes"] = scaled_boxes
+
+        if "area" in target:
+            area = target["area"]
+            scaled_area = area * (ratio_width * ratio_height)
+            target["area"] = scaled_area
+
+        w, h = size
+        target["size"] = np.asarray([h, w], dtype=np.int64)
+
+        if "masks" in target:
+            # use PyTorch as current workaround
+            # TODO replace by self.resize
+            masks = torch.from_numpy(target["masks"][:, None]).float()
+            interpolated_masks = F.interpolate(masks, size=(h, w), mode="nearest")[:, 0] > 0.5
+            target["masks"] = interpolated_masks.numpy()
+
+        return rescaled_image, target
+
+    def _normalize(self, image, mean, std, target=None):
+        """
+        Normalize the image with a certain mean and std.
+
+        If given, also normalize the target bounding boxes based on the size of the image.
+        """
+
+        image = self.normalize(image, mean=mean, std=std)
+        if target is None:
+            return image, None
+
+        target = target.copy()
+        h, w = image.shape[-2:]
+
+        if "boxes" in target:
+            boxes = target["boxes"]
+            boxes = corners_to_center_format(boxes)
+            boxes = boxes / np.asarray([w, h, w, h], dtype=np.float32)
+            target["boxes"] = boxes
+
+        return image, target
+
+    def __call__(
+        self,
+        images: ImageInput,
+        annotations: Union[List[Dict], List[List[Dict]]] = None,
+        return_segmentation_masks: Optional[bool] = False,
+        masks_path: Optional[pathlib.Path] = None,
+        pad_and_return_pixel_mask: Optional[bool] = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several image(s) and optional annotations. Images are by default
+        padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are
+        real/which are padding.
+
+        .. warning::
+
+           NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
+           PIL images.
+
+        Args:
+            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+
+            annotations (:obj:`Dict`, :obj:`List[Dict]`, `optional`):
+                The corresponding annotations in COCO format.
+
+                In case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format =
+                "coco_detection"`, the annotations for each image should have the following format: {'image_id': int,
+                'annotations': [annotation]}, with the annotations being a list of COCO object annotations.
+
+                In case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format =
+                "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int,
+                'file_name': str, 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic
+                annotations.
+
+            return_segmentation_masks (:obj:`Dict`, :obj:`List[Dict]`, `optional`, defaults to :obj:`False`):
+                Whether to also return instance segmentation masks in case :obj:`format = "coco_detection"`.
+
+            masks_path (:obj:`pathlib.Path`, `optional`):
+                Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only
+                relevant in case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format =
+                "coco_panoptic"`.
+
+            pad_and_return_pixel_mask (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                Whether or not to pad images up to the largest image in a batch and create a pixel mask.
+
+                If left to the default, will return a pixel mask that is:
+
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+
+            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+                If set, will return tensors instead of NumPy arrays. If set to :obj:`'pt'`, return PyTorch
+                :obj:`torch.Tensor` objects.
+
+        Returns:
+            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+
+            - **pixel_values** -- Pixel values to be fed to a model.
+            - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if
+              `"pixel_mask"` is in :obj:`self.model_input_names`).
+        """
+        # Input type checking for clearer error
+
+        valid_images = False
+        valid_annotations = False
+        valid_masks_path = False
+
+        # Check that images has a valid type
+        if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
+            valid_images = True
+        elif isinstance(images, (list, tuple)):
+            if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
+                valid_images = True
+
+        if not valid_images:
+            raise ValueError(
+                "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example),"
+                "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
+            )
+
+        is_batched = bool(
+            isinstance(images, (list, tuple))
+            and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
+        )
+
+        # Check that annotations has a valid type
+        if annotations is not None:
+            if not is_batched:
+                if self.format == "coco_detection":
+                    if isinstance(annotations, dict) and "image_id" in annotations and "annotations" in annotations:
+                        if isinstance(annotations["annotations"], (list, tuple)):
+                            # an image can have no annotations
+                            if len(annotations["annotations"]) == 0 or isinstance(annotations["annotations"][0], dict):
+                                valid_annotations = True
+                elif self.format == "coco_panoptic":
+                    if isinstance(annotations, dict) and "image_id" in annotations and "segments_info" in annotations:
+                        if isinstance(annotations["segments_info"], (list, tuple)):
+                            # an image can have no segments (?)
+                            if len(annotations["segments_info"]) == 0 or isinstance(
+                                annotations["segments_info"][0], dict
+                            ):
+                                valid_annotations = True
+            else:
+                if isinstance(annotations, (list, tuple)):
+                    assert len(images) == len(annotations), "There must be as many annotations as there are images"
+                    if isinstance(annotations[0], Dict):
+                        if self.format == "coco_detection":
+                            if isinstance(annotations[0]["annotations"], (list, tuple)):
+                                valid_annotations = True
+                        elif self.format == "coco_panoptic":
+                            if isinstance(annotations[0]["segments_info"], (list, tuple)):
+                                valid_annotations = True
+
+            if not valid_annotations:
+                raise ValueError(
+                    """
+                    Annotations must of type `Dict` (single image) or `List[Dict]` (batch of images). In case of object
+                    detection, each dictionary should contain the keys 'image_id' and 'annotations', with the latter
+                    being a list of annotations in COCO format. In case of panoptic segmentation, each dictionary
+                    should contain the keys 'file_name', 'image_id' and 'segments_info', with the latter being a list
+                    of annotations in COCO format.
+                    """
+                )
+
+        # Check that masks_path has a valid type
+        if masks_path is not None:
+            if self.format == "coco_panoptic":
+                if isinstance(masks_path, pathlib.Path):
+                    valid_masks_path = True
+                if not valid_masks_path:
+                    raise ValueError(
+                        "The path to the directory containing the mask PNG files should be provided as a `pathlib.Path` object."
+                    )
+
+        if not is_batched:
+            images = [images]
+            if annotations is not None:
+                annotations = [annotations]
+
+        # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+        if annotations is not None:
+            for idx, (image, target) in enumerate(zip(images, annotations)):
+                if not isinstance(image, Image.Image):
+                    image = self.to_pil_image(image)
+                image, target = self.prepare(image, target, return_segmentation_masks, masks_path)
+                images[idx] = image
+                annotations[idx] = target
+
+        # transformations (resizing + normalization)
+        if self.do_resize and self.size is not None:
+            if annotations is not None:
+                for idx, (image, target) in enumerate(zip(images, annotations)):
+                    image, target = self._resize(image=image, target=target, size=self.size, max_size=self.max_size)
+                    images[idx] = image
+                    annotations[idx] = target
+            else:
+                for idx, image in enumerate(images):
+                    images[idx] = self._resize(image=image, target=None, size=self.size, max_size=self.max_size)[0]
+
+        if self.do_normalize:
+            if annotations is not None:
+                for idx, (image, target) in enumerate(zip(images, annotations)):
+                    image, target = self._normalize(
+                        image=image, mean=self.image_mean, std=self.image_std, target=target
+                    )
+                    images[idx] = image
+                    annotations[idx] = target
+            else:
+                images = [
+                    self._normalize(image=image, mean=self.image_mean, std=self.image_std)[0] for image in images
+                ]
+
+        if pad_and_return_pixel_mask:
+            # pad images up to largest image in batch and create pixel_mask
+            max_size = self._max_by_axis([list(image.shape) for image in images])
+            c, h, w = max_size
+            padded_images = []
+            pixel_mask = []
+            for image in images:
+                # create padded image
+                padded_image = np.zeros((c, h, w), dtype=np.float32)
+                padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image)
+                padded_images.append(padded_image)
+                # create pixel mask
+                mask = np.zeros((h, w), dtype=np.int64)
+                mask[: image.shape[1], : image.shape[2]] = True
+                pixel_mask.append(mask)
+            images = padded_images
+
+        # return as BatchFeature
+        data = {}
+        data["pixel_values"] = images
+        if pad_and_return_pixel_mask:
+            data["pixel_mask"] = pixel_mask
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+        if annotations is not None:
+            # Convert to TensorType
+            tensor_type = return_tensors
+            if not isinstance(tensor_type, TensorType):
+                tensor_type = TensorType(tensor_type)
+
+            if not tensor_type == TensorType.PYTORCH:
+                raise ValueError("Only PyTorch is supported for the moment.")
+            else:
+                if not is_torch_available():
+                    raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
+
+                encoded_inputs["target"] = [
+                    {k: torch.from_numpy(v) for k, v in target.items()} for target in annotations
+                ]
+
+        return encoded_inputs
+
+    def _max_by_axis(self, the_list):
+        # type: (List[List[int]]) -> List[int]
+        maxes = the_list[0]
+        for sublist in the_list[1:]:
+            for index, item in enumerate(sublist):
+                maxes[index] = max(maxes[index], item)
+        return maxes
+
+    def pad_and_create_pixel_mask(
+        self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None
+    ):
+        """
+        Pad images up to the largest image in a batch and create a corresponding :obj:`pixel_mask`.
+
+        Args:
+            pixel_values_list (:obj:`List[torch.Tensor]`):
+                List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W).
+            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+                If set, will return tensors instead of NumPy arrays. If set to :obj:`'pt'`, return PyTorch
+                :obj:`torch.Tensor` objects.
+
+        Returns:
+            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+
+            - **pixel_values** -- Pixel values to be fed to a model.
+            - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if
+              `"pixel_mask"` is in :obj:`self.model_input_names`).
+
+        """
+
+        max_size = self._max_by_axis([list(image.shape) for image in pixel_values_list])
+        c, h, w = max_size
+        padded_images = []
+        pixel_mask = []
+        for image in pixel_values_list:
+            # create padded image
+            padded_image = np.zeros((c, h, w), dtype=np.float32)
+            padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image)
+            padded_images.append(padded_image)
+            # create pixel mask
+            mask = np.zeros((h, w), dtype=np.int64)
+            mask[: image.shape[1], : image.shape[2]] = True
+            pixel_mask.append(mask)
+
+        # return as BatchFeature
+        data = {"pixel_values": padded_images, "pixel_mask": pixel_mask}
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+        return encoded_inputs
+
+    # POSTPROCESSING METHODS
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
+    def post_process(self, outputs, target_sizes):
+        """
+        Converts the output of :class:`~transformers.DetrForObjectDetection` into the format expected by the COCO api.
+        Only supports PyTorch.
+
+        Args:
+            outputs (:class:`~transformers.DetrObjectDetectionOutput`):
+                Raw outputs of the model.
+            target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`, `optional`):
+                Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
+                image size (before any data augmentation). For visualization, this should be the image size after data
+                augment, but before padding.
+
+        Returns:
+            :obj:`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an
+            image in the batch as predicted by the model.
+        """
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        assert len(out_logits) == len(
+            target_sizes
+        ), "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+        assert (
+            target_sizes.shape[1] == 2
+        ), "Each element of target_sizes must contain the size (h, w) of each image of the batch"
+
+        prob = F.softmax(out_logits, -1)
+        scores, labels = prob[..., :-1].max(-1)
+
+        # convert to [x0, y0, x1, y1] format
+        boxes = center_to_corners_format(out_bbox)
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        img_h, img_w = target_sizes.unbind(1)
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+        boxes = boxes * scale_fct[:, None, :]
+
+        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+
+        return results
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
+    def post_process_segmentation(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
+        """
+        Converts the output of :class:`~transformers.DetrForSegmentation` into actual instance segmentation
+        predictions. Only supports PyTorch.
+
+        Args:
+            results (:obj:`List[Dict]`):
+                Results list obtained by :meth:`~transformers.DetrFeatureExtractor.post_process`, to which "masks"
+                results will be added.
+            outputs (:class:`~transformers.DetrSegmentationOutput`):
+                Raw outputs of the model.
+            orig_target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`):
+                Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
+                image size (before any data augmentation).
+            max_target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`):
+                Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the
+                original image size (before any data augmentation).
+            threshold (:obj:`float`, `optional`, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+
+        Returns:
+            :obj:`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks
+            for an image in the batch as predicted by the model.
+        """
+
+        assert len(orig_target_sizes) == len(
+            max_target_sizes
+        ), "Make sure to pass in as many orig_target_sizes as max_target_sizes"
+        max_h, max_w = max_target_sizes.max(0)[0].tolist()
+        outputs_masks = outputs.pred_masks.squeeze(2)
+        outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False)
+        outputs_masks = (outputs_masks.sigmoid() > threshold).cpu()
+
+        for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
+            img_h, img_w = t[0], t[1]
+            results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
+            results[i]["masks"] = F.interpolate(
+                results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest"
+            ).byte()
+
+        return results
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
+    def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
+        """
+        Converts the output of :class:`~transformers.DetrForSegmentation` into actual panoptic predictions. Only
+        supports PyTorch.
+
+        Parameters:
+            outputs (:class:`~transformers.DetrSegmentationOutput`):
+                Raw outputs of the model.
+            processed_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)` or :obj:`List[Tuple]` of length :obj:`batch_size`):
+                Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data
+                augmentation but before batching.
+            target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)` or :obj:`List[Tuple]` of length :obj:`batch_size`, `optional`):
+                Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to
+                None, it will default to the :obj:`processed_sizes`.
+            is_thing_map (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`, `optional`):
+                Dictionary mapping class indices to either True or False, depending on whether or not they are a thing.
+                If not set, defaults to the :obj:`is_thing_map` of COCO panoptic.
+            threshold (:obj:`float`, `optional`, defaults to 0.85):
+                Threshold to use to filter out queries.
+
+        Returns:
+            :obj:`List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values
+            for an image in the batch as predicted by the model.
+        """
+        if target_sizes is None:
+            target_sizes = processed_sizes
+        assert len(processed_sizes) == len(
+            target_sizes
+        ), "Make sure to pass in as many processed_sizes as target_sizes"
+
+        if is_thing_map is None:
+            # default to is_thing_map of COCO panoptic
+            is_thing_map = {i: i <= 90 for i in range(201)}
+
+        out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes
+        assert (
+            len(out_logits) == len(raw_masks) == len(target_sizes)
+        ), "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks"
+        preds = []
+
+        def to_tuple(tup):
+            if isinstance(tup, tuple):
+                return tup
+            return tuple(tup.cpu().tolist())
+
+        for cur_logits, cur_masks, cur_boxes, size, target_size in zip(
+            out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes
+        ):
+            # we filter empty queries and detection below threshold
+            scores, labels = cur_logits.softmax(-1).max(-1)
+            keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold)
+            cur_scores, cur_classes = cur_logits.softmax(-1).max(-1)
+            cur_scores = cur_scores[keep]
+            cur_classes = cur_classes[keep]
+            cur_masks = cur_masks[keep]
+            cur_masks = F.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
+            cur_boxes = center_to_corners_format(cur_boxes[keep])
+
+            h, w = cur_masks.shape[-2:]
+            assert len(cur_boxes) == len(cur_classes), "Not as many boxes as there are classes"
+
+            # It may be that we have several predicted masks for the same stuff class.
+            # In the following, we track the list of masks ids for each stuff class (they are merged later on)
+            cur_masks = cur_masks.flatten(1)
+            stuff_equiv_classes = defaultdict(lambda: [])
+            for k, label in enumerate(cur_classes):
+                if not is_thing_map[label.item()]:
+                    stuff_equiv_classes[label.item()].append(k)
+
+            def get_ids_area(masks, scores, dedup=False):
+                # This helper function creates the final panoptic segmentation image
+                # It also returns the area of the masks that appears on the image
+
+                m_id = masks.transpose(0, 1).softmax(-1)
+
+                if m_id.shape[-1] == 0:
+                    # We didn't detect any mask :(
+                    m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device)
+                else:
+                    m_id = m_id.argmax(-1).view(h, w)
+
+                if dedup:
+                    # Merge the masks corresponding to the same stuff class
+                    for equiv in stuff_equiv_classes.values():
+                        if len(equiv) > 1:
+                            for eq_id in equiv:
+                                m_id.masked_fill_(m_id.eq(eq_id), equiv[0])
+
+                final_h, final_w = to_tuple(target_size)
+
+                seg_img = Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy()))
+                seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST)
+
+                np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
+                np_seg_img = np_seg_img.view(final_h, final_w, 3)
+                np_seg_img = np_seg_img.numpy()
+
+                m_id = torch.from_numpy(rgb_to_id(np_seg_img))
+
+                area = []
+                for i in range(len(scores)):
+                    area.append(m_id.eq(i).sum().item())
+                return area, seg_img
+
+            area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True)
+            if cur_classes.numel() > 0:
+                # We know filter empty masks as long as we find some
+                while True:
+                    filtered_small = torch.as_tensor(
+                        [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device
+                    )
+                    if filtered_small.any().item():
+                        cur_scores = cur_scores[~filtered_small]
+                        cur_classes = cur_classes[~filtered_small]
+                        cur_masks = cur_masks[~filtered_small]
+                        area, seg_img = get_ids_area(cur_masks, cur_scores)
+                    else:
+                        break
+
+            else:
+                cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device)
+
+            segments_info = []
+            for i, a in enumerate(area):
+                cat = cur_classes[i].item()
+                segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a})
+            del cur_classes
+
+            with io.BytesIO() as out:
+                seg_img.save(out, format="PNG")
+                predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+            preds.append(predictions)
+        return preds
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
new file mode 100644
index 00000000000..b876e1be8fe
--- /dev/null
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -0,0 +1,2267 @@
+# coding=utf-8
+# Copyright 2021 Facebook AI Research The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch DETR model. """
+
+
+import math
+import random
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...file_utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_scipy_available,
+    is_timm_available,
+    is_vision_available,
+    replace_return_docstrings,
+    requires_backends,
+)
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import logging
+from .configuration_detr import DetrConfig
+
+
+if is_scipy_available():
+    from scipy.optimize import linear_sum_assignment
+
+if is_vision_available():
+    from .feature_extraction_detr import center_to_corners_format
+
+if is_timm_available():
+    from timm import create_model
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "DetrConfig"
+
+DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "facebook/detr-resnet-50",
+    # See all DETR models at https://huggingface.co/models?filter=detr
+]
+
+
+@dataclass
+class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
+    """
+    Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
+    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
+    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
+
+    Args:
+        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+            each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
+        cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+            attention softmax, used to compute the weighted average in the cross-attention heads.
+        intermediate_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(config.decoder_layers, batch_size, num_queries, hidden_size)`, `optional`, returned when ``config.auxiliary_loss=True``):
+            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
+            layernorm.
+    """
+
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class DetrModelOutput(Seq2SeqModelOutput):
+    """
+    Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput,
+    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
+    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
+
+    Args:
+        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model. If
+            :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1,
+            hidden_size)` is output.
+        decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
+            each layer plus the initial embedding outputs.
+        decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
+            compute the weighted average in the self-attention heads.
+        cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+            attention softmax, used to compute the weighted average in the cross-attention heads.
+        encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
+            each layer plus the initial embedding outputs.
+        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
+            compute the weighted average in the self-attention heads.
+        intermediate_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(config.decoder_layers, batch_size, sequence_length, hidden_size)`, `optional`, returned when ``config.auxiliary_loss=True``):
+            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
+            layernorm.
+    """
+
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class DetrObjectDetectionOutput(ModelOutput):
+    """
+    Output type of :class:`~transformers.DetrForObjectDetection`.
+
+    Args:
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` are provided)):
+            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
+            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+            scale-invariant IoU loss.
+        loss_dict (:obj:`Dict`, `optional`):
+            A dictionary containing the individual losses. Useful for logging.
+        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, num_classes + 1)`):
+            Classification logits (including no-object) for all queries.
+        pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`):
+            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+            possible padding). You can use :class:`~transformers.DetrForObjectDetection.post_process` to retrieve the
+            unnormalized bounding boxes.
+        auxiliary_outputs (:obj:`list[Dict]`, `optional`):
+            Optional, only returned when auxilary losses are activated (i.e. :obj:`config.auxiliary_loss` is set to
+            `True`) and labels are provided. It is a list of dictionnaries containing the two above keys (:obj:`logits`
+            and :obj:`pred_boxes`) for each decoder layer.
+        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+            If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
+            1, hidden_size)` is output.
+        decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
+            each layer plus the initial embedding outputs.
+        decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
+            compute the weighted average in the self-attention heads.
+        cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+            attention softmax, used to compute the weighted average in the cross-attention heads.
+        encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
+            each layer plus the initial embedding outputs.
+        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
+            compute the weighted average in the self-attention heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[Dict] = None
+    logits: torch.FloatTensor = None
+    pred_boxes: torch.FloatTensor = None
+    auxiliary_outputs: Optional[List[Dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class DetrSegmentationOutput(ModelOutput):
+    """
+    Output type of :class:`~transformers.DetrForSegmentation`.
+
+    Args:
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` are provided)):
+            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
+            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+            scale-invariant IoU loss.
+        loss_dict (:obj:`Dict`, `optional`):
+            A dictionary containing the individual losses. Useful for logging.
+        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, num_classes + 1)`):
+            Classification logits (including no-object) for all queries.
+        pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`):
+            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+            possible padding). You can use :meth:`~transformers.DetrForObjectDetection.post_process` to retrieve the
+            unnormalized bounding boxes.
+        pred_masks (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, width, height)`):
+            Segmentation masks for all queries. See also
+            :meth:`~transformers.DetrFeatureExtractor.post_process_segmentation` or
+            :meth:`~transformers.DetrFeatureExtractor.post_process_panoptic` to evaluate instance and panoptic
+            segmentation masks respectively.
+        auxiliary_outputs (:obj:`list[Dict]`, `optional`):
+            Optional, only returned when auxilary losses are activated (i.e. :obj:`config.auxiliary_loss` is set to
+            `True`) and labels are provided. It is a list of dictionnaries containing the two above keys (:obj:`logits`
+            and :obj:`pred_boxes`) for each decoder layer.
+        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+            If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
+            1, hidden_size)` is output.
+        decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
+            each layer plus the initial embedding outputs.
+        decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
+            compute the weighted average in the self-attention heads.
+        cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+            attention softmax, used to compute the weighted average in the cross-attention heads.
+        encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
+            each layer plus the initial embedding outputs.
+        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
+            compute the weighted average in the self-attention heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[Dict] = None
+    logits: torch.FloatTensor = None
+    pred_boxes: torch.FloatTensor = None
+    pred_masks: torch.FloatTensor = None
+    auxiliary_outputs: Optional[List[Dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+# BELOW: utilities copied from
+# https://github.com/facebookresearch/detr/blob/master/backbone.py
+class DetrFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+    """
+
+    def __init__(self, n):
+        super(DetrFrozenBatchNorm2d, self).__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        num_batches_tracked_key = prefix + "num_batches_tracked"
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+
+        super(DetrFrozenBatchNorm2d, self)._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it user-friendly
+        weight = self.weight.reshape(1, -1, 1, 1)
+        bias = self.bias.reshape(1, -1, 1, 1)
+        running_var = self.running_var.reshape(1, -1, 1, 1)
+        running_mean = self.running_mean.reshape(1, -1, 1, 1)
+        epsilon = 1e-5
+        scale = weight * (running_var + epsilon).rsqrt()
+        bias = bias - running_mean * scale
+        return x * scale + bias
+
+
+def replace_batch_norm(m, name=""):
+    for attr_str in dir(m):
+        target_attr = getattr(m, attr_str)
+        if isinstance(target_attr, torch.nn.BatchNorm2d):
+            frozen = DetrFrozenBatchNorm2d(target_attr.num_features)
+            bn = getattr(m, attr_str)
+            frozen.weight.data.copy_(bn.weight)
+            frozen.bias.data.copy_(bn.bias)
+            frozen.running_mean.data.copy_(bn.running_mean)
+            frozen.running_var.data.copy_(bn.running_var)
+            setattr(m, attr_str, frozen)
+    for n, ch in m.named_children():
+        replace_batch_norm(ch, n)
+
+
+class DetrTimmConvEncoder(nn.Module):
+    """
+    Convolutional encoder (backbone) from the timm library.
+
+    nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above.
+
+    """
+
+    def __init__(self, name: str, dilation: bool):
+        super().__init__()
+
+        kwargs = {}
+        if dilation:
+            kwargs["output_stride"] = 16
+
+        requires_backends(self, ["timm"])
+
+        backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs)
+        # replace batch norm by frozen batch norm
+        with torch.no_grad():
+            replace_batch_norm(backbone)
+        self.model = backbone
+        self.intermediate_channel_sizes = self.model.feature_info.channels()
+
+        if "resnet" in name:
+            for name, parameter in self.model.named_parameters():
+                if "layer2" not in name and "layer3" not in name and "layer4" not in name:
+                    parameter.requires_grad_(False)
+
+    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+        # send pixel_values through the model to get list of feature maps
+        features = self.model(pixel_values)
+
+        out = []
+        for feature_map in features:
+            # downsample pixel_mask to match shape of corresponding feature_map
+            mask = F.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+            out.append((feature_map, mask))
+        return out
+
+
+class DetrConvModel(nn.Module):
+    """
+    This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
+    """
+
+    def __init__(self, conv_encoder, position_embedding):
+        super().__init__()
+        self.conv_encoder = conv_encoder
+        self.position_embedding = position_embedding
+
+    def forward(self, pixel_values, pixel_mask):
+        # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples
+        out = self.conv_encoder(pixel_values, pixel_mask)
+        pos = []
+        for feature_map, mask in out:
+            # position encoding
+            pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
+
+        return out, pos
+
+
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)
+
+
+class DetrSinePositionEmbedding(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
+    need paper, generalized to work on images.
+    """
+
+    def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+
+    def forward(self, pixel_values, pixel_mask):
+        assert pixel_mask is not None, "No pixel mask provided"
+        y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
+        x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale
+
+        dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.embedding_dim)
+
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        return pos
+
+
+class DetrLearnedPositionEmbedding(nn.Module):
+    """
+    This module learns positional embeddings up to a fixed maximum size.
+    """
+
+    def __init__(self, embedding_dim=256):
+        super().__init__()
+        self.row_embeddings = nn.Embedding(50, embedding_dim)
+        self.column_embeddings = nn.Embedding(50, embedding_dim)
+
+    def forward(self, pixel_values, pixel_mask=None):
+        h, w = pixel_values.shape[-2:]
+        i = torch.arange(w, device=pixel_values.device)
+        j = torch.arange(h, device=pixel_values.device)
+        x_emb = self.column_embeddings(i)
+        y_emb = self.row_embeddings(j)
+        pos = torch.cat([x_emb.unsqueeze(0).repeat(h, 1, 1), y_emb.unsqueeze(1).repeat(1, w, 1)], dim=-1)
+        pos = pos.permute(2, 0, 1)
+        pos = pos.unsqueeze(0)
+        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
+        return pos
+
+
+def build_position_encoding(config):
+    n_steps = config.d_model // 2
+    if config.position_embedding_type == "sine":
+        # TODO find a better way of exposing other arguments
+        position_embedding = DetrSinePositionEmbedding(n_steps, normalize=True)
+    elif config.position_embedding_type == "learned":
+        position_embedding = DetrLearnedPositionEmbedding(n_steps)
+    else:
+        raise ValueError(f"Not supported {config.position_embedding_type}")
+
+    return position_embedding
+
+
+class DetrAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Here, we add position embeddings to the queries and keys (as explained in the DETR paper).
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        assert (
+            self.head_dim * num_heads == self.embed_dim
+        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
+        self.scaling = self.head_dim ** -0.5
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+        return tensor if position_embeddings is None else tensor + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        key_value_states: Optional[torch.Tensor] = None,
+        key_value_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # add position embeddings to the hidden states before projecting to queries and keys
+        if position_embeddings is not None:
+            hidden_states_original = hidden_states
+            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+        # add key-value position embeddings to the key value states
+        if key_value_position_embeddings is not None:
+            key_value_states_original = key_value_states
+            key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        if is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states_original), -1, bsz)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states_original), -1, bsz)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = F.softmax(attn_weights, dim=-1)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+class DetrEncoderLayer(nn.Module):
+    def __init__(self, config: DetrConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = DetrAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_embeddings: torch.Tensor = None,
+        output_attentions: bool = False,
+    ):
+        """
+        Args:
+            hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
+            attention_mask (:obj:`torch.FloatTensor`): attention mask of size
+                :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            position_embeddings (:obj:`torch.FloatTensor`, `optional`): position embeddings, to be added to hidden_states.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class DetrDecoderLayer(nn.Module):
+    def __init__(self, config: DetrConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = DetrAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = DetrAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        query_position_embeddings: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ):
+        """
+        Args:
+            hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
+            attention_mask (:obj:`torch.FloatTensor`): attention mask of size
+                :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            position_embeddings (:obj:`torch.FloatTensor`, `optional`): position embeddings that are added to the queries and keys
+            in the cross-attention layer.
+            query_position_embeddings (:obj:`torch.FloatTensor`, `optional`): position embeddings that are added to the queries and keys
+            in the self-attention layer.
+            encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
+            encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
+                :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=query_position_embeddings,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+
+            hidden_states, cross_attn_weights = self.encoder_attn(
+                hidden_states=hidden_states,
+                position_embeddings=query_position_embeddings,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                key_value_position_embeddings=position_embeddings,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        return outputs
+
+
+class DetrClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
+        super().__init__()
+        self.dense = nn.Linear(input_dim, inner_dim)
+        self.dropout = nn.Dropout(p=pooler_dropout)
+        self.out_proj = nn.Linear(inner_dim, num_classes)
+
+    def forward(self, hidden_states: torch.Tensor):
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.dense(hidden_states)
+        hidden_states = torch.tanh(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.out_proj(hidden_states)
+        return hidden_states
+
+
+class DetrPreTrainedModel(PreTrainedModel):
+    config_class = DetrConfig
+    base_model_prefix = "model"
+
+    def _init_weights(self, module):
+        std = self.config.init_std
+        xavier_std = self.config.init_xavier_std
+
+        if isinstance(module, DetrMHAttentionMap):
+            nn.init.zeros_(module.k_linear.bias)
+            nn.init.zeros_(module.q_linear.bias)
+            nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std)
+            nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std)
+        elif isinstance(module, DetrLearnedPositionEmbedding):
+            nn.init.uniform_(module.row_embeddings.weight)
+            nn.init.uniform_(module.column_embeddings.weight)
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+DETR_START_DOCSTRING = r"""
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
+
+    Parameters:
+        config (:class:`~transformers.DetrConfig`):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+DETR_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it.
+
+            Pixel values can be obtained using :class:`~transformers.DetrTokenizer`. See
+            :meth:`transformers.DetrTokenizer.__call__` for details.
+
+        pixel_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, height, width)`, `optional`):
+            Mask to avoid performing attention on padding pixel values. Mask values selected in ``[0, 1]``:
+
+            - 1 for pixels that are real (i.e. **not masked**),
+            - 0 for pixels that are padding (i.e. **masked**).
+
+            `What are attention masks? <../glossary.html#attention-mask>`__
+
+        decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_queries)`, `optional`):
+            Not used by default. Can be used to mask object queries.
+        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
+            Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
+            :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+            `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+            cross-attention of the decoder.
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, hidden_size)`, `optional`):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+        output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+            tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+"""
+
+
+class DetrEncoder(DetrPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    :class:`DetrEncoderLayer`.
+
+    The encoder updates the flattened feature map through multiple self-attention layers.
+
+    Small tweak for DETR:
+
+    - position_embeddings are added to the forward pass.
+
+    Args:
+        config: DetrConfig
+        embed_tokens (torch.nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+
+        embed_dim = config.d_model
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_position_embeddings
+        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+        self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)])
+
+        # in the original DETR, no layernorm is used for the Encoder, as "normalize_before" is set to False by default there
+
+        self.init_weights()
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        attention_mask=None,
+        position_embeddings=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+                Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+
+            attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding pixel features. Mask values selected in ``[0, 1]``:
+
+                - 1 for pixel features that are real (i.e. **not masked**),
+                - 0 for pixel features that are padding (i.e. **masked**).
+
+                `What are attention masks? <../glossary.html#attention-mask>`__
+
+            position_embeddings (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+                for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = inputs_embeds
+        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        for i, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+                layer_outputs = (None, None)
+            else:
+                # we add position_embeddings as extra input to the encoder_layer
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    position_embeddings=position_embeddings,
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class DetrDecoder(DetrPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`DetrDecoderLayer`.
+
+    The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
+
+    Some small tweaks for DETR:
+
+    - position_embeddings and query_position_embeddings are added to the forward pass.
+    - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
+
+    Args:
+        config: DetrConfig
+        embed_tokens (torch.nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: DetrConfig, embed_tokens: Optional[nn.Embedding] = None):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_position_embeddings
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+
+        self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)])
+        # in DETR, the decoder uses layernorm after the last decoder layer output
+        self.layernorm = nn.LayerNorm(config.d_model)
+
+        self.init_weights()
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        position_embeddings=None,
+        query_position_embeddings=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+
+            attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on certain queries. Mask values selected in ``[0, 1]``:
+
+                - 1 for queries that are **not masked**,
+                - 0 for queries that are **masked**.
+
+                `What are attention masks? <../glossary.html#attention-mask>`__
+            encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in ``[0, 1]``:
+
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+
+            position_embeddings (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+                Position embeddings that are added to the queries and keys in each cross-attention layer.
+            query_position_embeddings (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, hidden_size)`):, `optional`):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+                for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+            input_shape = inputs_embeds.size()[:-1]
+
+        combined_attention_mask = None
+
+        if attention_mask is not None and combined_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            combined_attention_mask = combined_attention_mask + _expand_mask(
+                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+
+        # optional intermediate hidden states
+        intermediate = () if self.config.auxiliary_loss else None
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+
+            if getattr(self.config, "gradient_checkpointing", False) and self.training:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs, output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    combined_attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=combined_attention_mask,
+                    position_embeddings=position_embeddings,
+                    query_position_embeddings=query_position_embeddings,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    output_attentions=output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if self.config.auxiliary_loss:
+                hidden_states = self.layernorm(hidden_states)
+                intermediate += (hidden_states,)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # finally, apply layernorm
+        hidden_states = self.layernorm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # stack intermediate decoder activations
+        if self.config.auxiliary_loss:
+            intermediate = torch.stack(intermediate)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate]
+                if v is not None
+            )
+        return DetrDecoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+            intermediate_hidden_states=intermediate,
+        )
+
+
+@add_start_docstrings(
+    """
+    The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
+    any specific head on top.
+    """,
+    DETR_START_DOCSTRING,
+)
+class DetrModel(DetrPreTrainedModel):
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        # Create backbone + positional encoding
+        backbone = DetrTimmConvEncoder(config.backbone, config.dilation)
+        position_embeddings = build_position_encoding(config)
+        self.backbone = DetrConvModel(backbone, position_embeddings)
+
+        # Create projection layer
+        self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1)
+
+        self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
+
+        self.encoder = DetrEncoder(config)
+        self.decoder = DetrDecoder(config)
+
+        self.init_weights()
+
+    def get_encoder(self):
+        return self.encoder
+
+    def get_decoder(self):
+        return self.decoder
+
+    def freeze_backbone(self):
+        for name, param in self.backbone.conv_encoder.model.named_parameters():
+            param.requires_grad_(False)
+
+    def unfreeze_backbone(self):
+        for name, param in self.backbone.conv_encoder.model.named_parameters():
+            param.requires_grad_(True)
+
+    @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=DetrModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values,
+        pixel_mask=None,
+        decoder_attention_mask=None,
+        encoder_outputs=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Returns:
+
+        Examples::
+
+            >>> from transformers import DetrFeatureExtractor, DetrModel
+            >>> from PIL import Image
+            >>> import requests
+
+            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+            >>> image = Image.open(requests.get(url, stream=True).raw)
+
+            >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
+            >>> model = DetrModel.from_pretrained('facebook/detr-resnet-50')
+            >>> inputs = feature_extractor(images=image, return_tensors="pt")
+            >>> outputs = model(**inputs)
+            >>> last_hidden_states = outputs.last_hidden_state
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones(((batch_size, height, width)), device=device)
+
+        # First, sent pixel_values + pixel_mask through Backbone to obtain the features
+        # pixel_values should be of shape (batch_size, num_channels, height, width)
+        # pixel_mask should be of shape (batch_size, height, width)
+        features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)
+
+        # get final feature map and downsampled mask
+        feature_map, mask = features[-1]
+
+        assert mask is not None, "Backbone does not return downsampled pixel mask"
+
+        # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        projected_feature_map = self.input_projection(feature_map)
+
+        # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
+        # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
+        flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
+        position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1)
+
+        flattened_mask = mask.flatten(1)
+
+        # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder
+        # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, heigth*width)
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                inputs_embeds=flattened_features,
+                attention_mask=flattened_mask,
+                position_embeddings=position_embeddings,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output)
+        query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1)
+        queries = torch.zeros_like(query_position_embeddings)
+
+        # decoder outputs consists of (dec_features, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            inputs_embeds=queries,
+            attention_mask=None,
+            position_embeddings=position_embeddings,
+            query_position_embeddings=query_position_embeddings,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=flattened_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return DetrModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+        )
+
+
+@add_start_docstrings(
+    """
+    DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
+    such as COCO detection.
+    """,
+    DETR_START_DOCSTRING,
+)
+class DetrForObjectDetection(DetrPreTrainedModel):
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        # DETR encoder-decoder model
+        self.model = DetrModel(config)
+
+        # Object detection heads
+        self.class_labels_classifier = nn.Linear(
+            config.d_model, config.num_labels + 1
+        )  # We add one for the "no object" class
+        self.bbox_predictor = DetrMLPPredictionHead(
+            input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+        )
+
+        self.init_weights()
+
+    # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
+
+    @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=DetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values,
+        pixel_mask=None,
+        decoder_attention_mask=None,
+        encoder_outputs=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`List[Dict]` of len :obj:`(batch_size,)`, `optional`):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing 2 keys:
+            'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch respectively). The
+            class labels themselves should be a :obj:`torch.LongTensor` of len :obj:`(number of bounding boxes in the
+            image,)` and the boxes a :obj:`torch.FloatTensor` of shape :obj:`(number of bounding boxes in the image,
+            4)`.
+
+        Returns:
+
+        Examples::
+
+            >>> from transformers import DetrFeatureExtractor, DetrForObjectDetection
+            >>> from PIL import Image
+            >>> import requests
+
+            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+            >>> image = Image.open(requests.get(url, stream=True).raw)
+
+            >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
+            >>> model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')
+
+            >>> inputs = feature_extractor(images=image, return_tensors="pt")
+            >>> outputs = model(**inputs)
+            >>> # model predicts bounding boxes and corresponding COCO classes
+            >>> logits = outputs.logits
+            >>> bboxes = outputs.pred_boxes
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # First, sent images through DETR base model to obtain encoder + decoder outputs
+        outputs = self.model(
+            pixel_values,
+            pixel_mask=pixel_mask,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        # class logits + predicted bounding boxes
+        logits = self.class_labels_classifier(sequence_output)
+        pred_boxes = self.bbox_predictor(sequence_output).sigmoid()
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            # First: create the matcher
+            matcher = DetrHungarianMatcher(
+                class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost
+            )
+            # Second: create the criterion
+            losses = ["labels", "boxes", "cardinality"]
+            criterion = DetrLoss(
+                matcher=matcher,
+                num_classes=self.config.num_labels,
+                eos_coef=self.config.eos_coefficient,
+                losses=losses,
+            )
+            criterion.to(self.device)
+            # Third: compute the losses, based on outputs and labels
+            outputs_loss = {}
+            outputs_loss["logits"] = logits
+            outputs_loss["pred_boxes"] = pred_boxes
+            if self.config.auxiliary_loss:
+                intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4]
+                outputs_class = self.class_labels_classifier(intermediate)
+                outputs_coord = self.bbox_predictor(intermediate).sigmoid()
+                auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord)
+                outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+
+            loss_dict = criterion(outputs_loss, labels)
+            # Fourth: compute total loss, as a weighted sum of the various losses
+            weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient}
+            weight_dict["loss_giou"] = self.config.giou_loss_coefficient
+            if self.config.auxiliary_loss:
+                aux_weight_dict = {}
+                for i in range(self.config.decoder_layers - 1):
+                    aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+                weight_dict.update(aux_weight_dict)
+            loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+
+        if not return_dict:
+            if auxiliary_outputs is not None:
+                output = (logits, pred_boxes) + auxiliary_outputs + outputs
+            else:
+                output = (logits, pred_boxes) + outputs
+            return ((loss, loss_dict) + output) if loss is not None else output
+
+        return DetrObjectDetectionOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            auxiliary_outputs=auxiliary_outputs,
+            last_hidden_state=outputs.last_hidden_state,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks
+    such as COCO panoptic.
+
+    """,
+    DETR_START_DOCSTRING,
+)
+class DetrForSegmentation(DetrPreTrainedModel):
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        # object detection model
+        self.detr = DetrForObjectDetection(config)
+
+        # segmentation head
+        hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads
+        intermediate_channel_sizes = self.detr.model.backbone.conv_encoder.intermediate_channel_sizes
+
+        self.mask_head = DetrMaskHeadSmallConv(
+            hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size
+        )
+
+        self.bbox_attention = DetrMHAttentionMap(
+            hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std
+        )
+
+        self.init_weights()
+
+    @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=DetrSegmentationOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values,
+        pixel_mask=None,
+        decoder_attention_mask=None,
+        encoder_outputs=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`List[Dict]` of len :obj:`(batch_size,)`, `optional`):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing 3 keys:
+            'class_labels', 'boxes' and 'masks' (the class labels, bounding boxes and segmentation masks of an image in
+            the batch respectively). The class labels themselves should be a :obj:`torch.LongTensor` of len
+            :obj:`(number of bounding boxes in the image,)`, the boxes a :obj:`torch.FloatTensor` of shape
+            :obj:`(number of bounding boxes in the image, 4)` and the masks a :obj:`torch.FloatTensor` of shape
+            :obj:`(number of bounding boxes in the image, 4)`.
+
+        Returns:
+
+        Examples::
+
+            >>> from transformers import DetrFeatureExtractor, DetrForSegmentation
+            >>> from PIL import Image
+            >>> import requests
+
+            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+            >>> image = Image.open(requests.get(url, stream=True).raw)
+
+            >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50-panoptic')
+            >>> model = DetrForSegmentation.from_pretrained('facebook/detr-resnet-50-panoptic')
+
+            >>> inputs = feature_extractor(images=image, return_tensors="pt")
+            >>> outputs = model(**inputs)
+            >>> # model predicts COCO classes, bounding boxes, and masks
+            >>> logits = outputs.logits
+            >>> bboxes = outputs.pred_boxes
+            >>> masks = outputs.pred_masks
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones((batch_size, height, width), device=device)
+
+        # First, get list of feature maps and position embeddings
+        features, position_embeddings_list = self.detr.model.backbone(pixel_values, pixel_mask=pixel_mask)
+
+        # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        feature_map, mask = features[-1]
+        batch_size, num_channels, height, width = feature_map.shape
+        projected_feature_map = self.detr.model.input_projection(feature_map)
+
+        # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
+        # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
+        flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
+        position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1)
+
+        flattened_mask = mask.flatten(1)
+
+        # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder
+        # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, heigth*width)
+        if encoder_outputs is None:
+            encoder_outputs = self.detr.model.encoder(
+                inputs_embeds=flattened_features,
+                attention_mask=flattened_mask,
+                position_embeddings=position_embeddings,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output)
+        query_position_embeddings = self.detr.model.query_position_embeddings.weight.unsqueeze(0).repeat(
+            batch_size, 1, 1
+        )
+        queries = torch.zeros_like(query_position_embeddings)
+
+        # decoder outputs consists of (dec_features, dec_hidden, dec_attn)
+        decoder_outputs = self.detr.model.decoder(
+            inputs_embeds=queries,
+            attention_mask=None,
+            position_embeddings=position_embeddings,
+            query_position_embeddings=query_position_embeddings,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=flattened_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = decoder_outputs[0]
+
+        # Sixth, compute logits, pred_boxes and pred_masks
+        logits = self.detr.class_labels_classifier(sequence_output)
+        pred_boxes = self.detr.bbox_predictor(sequence_output).sigmoid()
+
+        memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width)
+        mask = flattened_mask.view(batch_size, height, width)
+
+        # FIXME h_boxes takes the last one computed, keep this in mind
+        # important: we need to reverse the mask, since in the original implementation the mask works reversed
+        # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32)
+        bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask)
+
+        seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]])
+
+        pred_masks = seg_masks.view(batch_size, self.detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1])
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            # First: create the matcher
+            matcher = DetrHungarianMatcher(
+                class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost
+            )
+            # Second: create the criterion
+            losses = ["labels", "boxes", "cardinality", "masks"]
+            criterion = DetrLoss(
+                matcher=matcher,
+                num_classes=self.config.num_labels,
+                eos_coef=self.config.eos_coefficient,
+                losses=losses,
+            )
+            criterion.to(self.device)
+            # Third: compute the losses, based on outputs and labels
+            outputs_loss = {}
+            outputs_loss["logits"] = logits
+            outputs_loss["pred_boxes"] = pred_boxes
+            outputs_loss["pred_masks"] = pred_masks
+            if self.config.auxiliary_loss:
+                intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1]
+                outputs_class = self.class_labels_classifier(intermediate)
+                outputs_coord = self.bbox_predictor(intermediate).sigmoid()
+                auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord)
+                outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+
+            loss_dict = criterion(outputs_loss, labels)
+            # Fourth: compute total loss, as a weighted sum of the various losses
+            weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient}
+            weight_dict["loss_giou"] = self.config.giou_loss_coefficient
+            weight_dict["loss_mask"] = self.config.mask_loss_coefficient
+            weight_dict["loss_dice"] = self.config.dice_loss_coefficient
+            if self.config.auxiliary_loss:
+                aux_weight_dict = {}
+                for i in range(self.config.decoder_layers - 1):
+                    aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+                weight_dict.update(aux_weight_dict)
+            loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+
+        if not return_dict:
+            if auxiliary_outputs is not None:
+                output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs
+            else:
+                output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs
+            return ((loss, loss_dict) + output) if loss is not None else output
+
+        return DetrSegmentationOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            pred_masks=pred_masks,
+            auxiliary_outputs=auxiliary_outputs,
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+def _expand(tensor, length: int):
+    return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py
+class DetrMaskHeadSmallConv(nn.Module):
+    """
+    Simple convolutional head, using group norm. Upsampling is done using a FPN approach
+    """
+
+    def __init__(self, dim, fpn_dims, context_dim):
+        super().__init__()
+
+        assert (
+            dim % 8 == 0
+        ), "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8"
+
+        inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
+
+        self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1)
+        self.gn1 = torch.nn.GroupNorm(8, dim)
+        self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1)
+        self.gn2 = torch.nn.GroupNorm(8, inter_dims[1])
+        self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
+        self.gn3 = torch.nn.GroupNorm(8, inter_dims[2])
+        self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
+        self.gn4 = torch.nn.GroupNorm(8, inter_dims[3])
+        self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
+        self.gn5 = torch.nn.GroupNorm(8, inter_dims[4])
+        self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1)
+
+        self.dim = dim
+
+        self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1)
+        self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1)
+        self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_uniform_(m.weight, a=1)
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]):
+        # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with
+        # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32).
+        # We expand the projected feature map to match the number of heads.
+        x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
+
+        x = self.lay1(x)
+        x = self.gn1(x)
+        x = F.relu(x)
+        x = self.lay2(x)
+        x = self.gn2(x)
+        x = F.relu(x)
+
+        cur_fpn = self.adapter1(fpns[0])
+        if cur_fpn.size(0) != x.size(0):
+            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
+        x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
+        x = self.lay3(x)
+        x = self.gn3(x)
+        x = F.relu(x)
+
+        cur_fpn = self.adapter2(fpns[1])
+        if cur_fpn.size(0) != x.size(0):
+            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
+        x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
+        x = self.lay4(x)
+        x = self.gn4(x)
+        x = F.relu(x)
+
+        cur_fpn = self.adapter3(fpns[2])
+        if cur_fpn.size(0) != x.size(0):
+            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
+        x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
+        x = self.lay5(x)
+        x = self.gn5(x)
+        x = F.relu(x)
+
+        x = self.out_lay(x)
+        return x
+
+
+class DetrMHAttentionMap(nn.Module):
+    """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
+
+    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None):
+        super().__init__()
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.dropout = nn.Dropout(dropout)
+
+        self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
+        self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
+
+        self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5
+
+    def forward(self, q, k, mask: Optional[Tensor] = None):
+        q = self.q_linear(q)
+        k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
+        queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
+        keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
+        weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head)
+
+        if mask is not None:
+            weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf"))
+        weights = F.softmax(weights.flatten(2), dim=-1).view(weights.size())
+        weights = self.dropout(weights)
+        return weights
+
+
+def dice_loss(inputs, targets, num_boxes):
+    """
+    Compute the DICE loss, similar to generalized IOU for masks
+
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs (0 for the negative class and 1 for the positive
+                 class).
+    """
+    inputs = inputs.sigmoid()
+    inputs = inputs.flatten(1)
+    numerator = 2 * (inputs * targets).sum(1)
+    denominator = inputs.sum(-1) + targets.sum(-1)
+    loss = 1 - (numerator + 1) / (denominator + 1)
+    return loss.sum() / num_boxes
+
+
+def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
+    """
+    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs (0 for the negative class and 1 for the positive
+                 class).
+        alpha: (optional) Weighting factor in range (0,1) to balance
+                positive vs negative examples. Default = -1 (no weighting).
+        gamma: Exponent of the modulating factor (1 - p_t) to
+               balance easy vs hard examples.
+
+    Returns:
+        Loss tensor
+    """
+    prob = inputs.sigmoid()
+    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    p_t = prob * targets + (1 - prob) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+
+    return loss.mean(1).sum() / num_boxes
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+class DetrLoss(nn.Module):
+    """
+    This class computes the losses for DetrForObjectDetection/DetrForSegmentation. The process happens in two steps: 1)
+    we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair
+    of matched ground-truth / prediction (supervise class and box)
+    """
+
+    def __init__(self, matcher, num_classes, eos_coef, losses):
+        """
+        Create the criterion.
+
+        A note on the num_classes parameter (copied from original repo in detr.py): "the naming of the `num_classes`
+        parameter of the criterion is somewhat misleading. it indeed corresponds to `max_obj_id + 1`, where max_obj_id
+        is the maximum id for a class in your dataset. For example, COCO has a max_obj_id of 90, so we pass
+        `num_classes` to be 91. As another example, for a dataset that has a single class with id 1, you should pass
+        `num_classes` to be 2 (max_obj_id + 1). For more details on this, check the following discussion
+        https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223"
+
+        Parameters:
+            matcher: module able to compute a matching between targets and proposals.
+            num_classes: number of object categories, omitting the special no-object category.
+            weight_dict: dict containing as key the names of the losses and as values their relative weight.
+            eos_coef: relative classification weight applied to the no-object category.
+            losses: list of all the losses to be applied. See get_loss for list of available losses.
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.matcher = matcher
+        self.eos_coef = eos_coef
+        self.losses = losses
+        empty_weight = torch.ones(self.num_classes + 1)
+        empty_weight[-1] = self.eos_coef
+        self.register_buffer("empty_weight", empty_weight)
+
+    # removed logging parameter, which was part of the original implementation
+    def loss_labels(self, outputs, targets, indices, num_boxes):
+        """
+        Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim
+        [nb_target_boxes]
+        """
+        assert "logits" in outputs, "No logits were found in the outputs"
+        src_logits = outputs["logits"]
+
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(
+            src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
+        )
+        target_classes[idx] = target_classes_o
+
+        loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
+        losses = {"loss_ce": loss_ce}
+
+        return losses
+
+    @torch.no_grad()
+    def loss_cardinality(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
+
+        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
+        """
+        logits = outputs["logits"]
+        device = logits.device
+        tgt_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
+        # Count the number of predictions that are NOT "no-object" (which is the last class)
+        card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
+        card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
+        losses = {"cardinality_error": card_err}
+        return losses
+
+    def loss_boxes(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
+
+        Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
+        are expected in format (center_x, center_y, w, h), normalized by the image size.
+        """
+        assert "pred_boxes" in outputs, "No predicted boxes found in outputs"
+        idx = self._get_src_permutation_idx(indices)
+        src_boxes = outputs["pred_boxes"][idx]
+        target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")
+
+        losses = {}
+        losses["loss_bbox"] = loss_bbox.sum() / num_boxes
+
+        loss_giou = 1 - torch.diag(
+            generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes))
+        )
+        losses["loss_giou"] = loss_giou.sum() / num_boxes
+        return losses
+
+    def loss_masks(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the losses related to the masks: the focal loss and the dice loss.
+
+        Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
+        """
+        assert "pred_masks" in outputs, "No predicted masks found in outputs"
+
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        src_masks = outputs["pred_masks"]
+        src_masks = src_masks[src_idx]
+        masks = [t["masks"] for t in targets]
+        # TODO use valid to mask invalid areas due to padding in loss
+        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
+        target_masks = target_masks.to(src_masks)
+        target_masks = target_masks[tgt_idx]
+
+        # upsample predictions to the target size
+        src_masks = F.interpolate(
+            src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
+        )
+        src_masks = src_masks[:, 0].flatten(1)
+
+        target_masks = target_masks.flatten(1)
+        target_masks = target_masks.view(src_masks.shape)
+        losses = {
+            "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
+            "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
+        }
+        return losses
+
+    def _get_src_permutation_idx(self, indices):
+        # permute predictions following indices
+        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
+        src_idx = torch.cat([src for (src, _) in indices])
+        return batch_idx, src_idx
+
+    def _get_tgt_permutation_idx(self, indices):
+        # permute targets following indices
+        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
+        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
+        return batch_idx, tgt_idx
+
+    def get_loss(self, loss, outputs, targets, indices, num_boxes):
+        loss_map = {
+            "labels": self.loss_labels,
+            "cardinality": self.loss_cardinality,
+            "boxes": self.loss_boxes,
+            "masks": self.loss_masks,
+        }
+        assert loss in loss_map, f"Loss {loss} not supported"
+        return loss_map[loss](outputs, targets, indices, num_boxes)
+
+    def forward(self, outputs, targets):
+        """
+        This performs the loss computation.
+
+        Parameters:
+             outputs: dict of tensors, see the output specification of the model for the format
+             targets: list of dicts, such that len(targets) == batch_size.
+                      The expected keys in each dict depends on the losses applied, see each loss' doc
+        """
+        outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
+
+        # Retrieve the matching between the outputs of the last layer and the targets
+        indices = self.matcher(outputs_without_aux, targets)
+
+        # Compute the average number of target boxes accross all nodes, for normalization purposes
+        num_boxes = sum(len(t["class_labels"]) for t in targets)
+        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
+        # (Niels): comment out function below, distributed training to be added
+        # if is_dist_avail_and_initialized():
+        #     torch.distributed.all_reduce(num_boxes)
+        # (Niels) in original implementation, num_boxes is divided by get_world_size()
+        num_boxes = torch.clamp(num_boxes, min=1).item()
+
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
+
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if "auxiliary_outputs" in outputs:
+            for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
+                indices = self.matcher(auxiliary_outputs, targets)
+                for loss in self.losses:
+                    if loss == "masks":
+                        # Intermediate masks losses are too costly to compute, we ignore them.
+                        continue
+                    l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
+                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        return losses
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+class DetrMLPPredictionHead(nn.Module):
+    """
+    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+    height and width of a bounding box w.r.t. an image.
+
+    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
+class DetrHungarianMatcher(nn.Module):
+    """
+    This class computes an assignment between the targets and the predictions of the network.
+
+    For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
+    predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
+    un-matched (and thus treated as non-objects).
+    """
+
+    def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
+        """
+        Creates the matcher.
+
+        Params:
+            class_cost: This is the relative weight of the classification error in the matching cost
+            bbox_cost: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
+            giou_cost: This is the relative weight of the giou loss of the bounding box in the matching cost
+        """
+        super().__init__()
+
+        requires_backends(self, ["scipy"])
+
+        self.class_cost = class_cost
+        self.bbox_cost = bbox_cost
+        self.giou_cost = giou_cost
+        assert class_cost != 0 or bbox_cost != 0 or giou_cost != 0, "All costs of the Matcher can't be 0"
+
+    @torch.no_grad()
+    def forward(self, outputs, targets):
+        """
+        Performs the matching.
+
+        Params:
+            outputs: This is a dict that contains at least these entries:
+                 "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
+            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
+                 "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
+                 objects in the target) containing the class labels "boxes": Tensor of dim [num_target_boxes, 4]
+                 containing the target box coordinates
+
+        Returns:
+            A list of size batch_size, containing tuples of (index_i, index_j) where:
+
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        bs, num_queries = outputs["logits"].shape[:2]
+
+        # We flatten to compute the cost matrices in a batch
+        out_prob = outputs["logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
+        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
+
+        # Also concat the target labels and boxes
+        tgt_ids = torch.cat([v["class_labels"] for v in targets])
+        tgt_bbox = torch.cat([v["boxes"] for v in targets])
+
+        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+        # but approximate it in 1 - proba[target class].
+        # The 1 is a constant that doesn't change the matching, it can be ommitted.
+        class_cost = -out_prob[:, tgt_ids]
+
+        # Compute the L1 cost between boxes
+        bbox_cost = torch.cdist(out_bbox, tgt_bbox, p=1)
+
+        # Compute the giou cost between boxes
+        giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(tgt_bbox))
+
+        # Final cost matrix
+        cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
+        cost_matrix = cost_matrix.view(bs, num_queries, -1).cpu()
+
+        sizes = [len(v["boxes"]) for v in targets]
+        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
+        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
+
+
+# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
+
+
+def _upcast(t: Tensor) -> Tensor:
+    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
+    if t.is_floating_point():
+        return t if t.dtype in (torch.float32, torch.float64) else t.float()
+    else:
+        return t if t.dtype in (torch.int32, torch.int64) else t.int()
+
+
+def box_area(boxes: Tensor) -> Tensor:
+    """
+    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
+
+    Args:
+        boxes (Tensor[N, 4]): boxes for which the area will be computed. They
+            are expected to be in (x1, y1, x2, y2) format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+
+    Returns:
+        area (Tensor[N]): area for each box
+    """
+    boxes = _upcast(boxes)
+    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+# modified from torchvision to also return the union
+def box_iou(boxes1, boxes2):
+    area1 = box_area(boxes1)
+    area2 = box_area(boxes2)
+
+    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+    wh = (rb - lt).clamp(min=0)  # [N,M,2]
+    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
+
+    union = area1[:, None] + area2 - inter
+
+    iou = inter / union
+    return iou, union
+
+
+def generalized_box_iou(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
+
+    Returns:
+        a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
+    """
+    # degenerate boxes gives inf / nan results
+    # so do an early check
+    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
+    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+    iou, union = box_iou(boxes1, boxes2)
+
+    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+
+    wh = (rb - lt).clamp(min=0)  # [N,M,2]
+    area = wh[:, :, 0] * wh[:, :, 1]
+
+    return iou - (area - union) / area
+
+
+# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306
+
+
+def _max_by_axis(the_list):
+    # type: (List[List[int]]) -> List[int]
+    maxes = the_list[0]
+    for sublist in the_list[1:]:
+        for index, item in enumerate(sublist):
+            maxes[index] = max(maxes[index], item)
+    return maxes
+
+
+class NestedTensor(object):
+    def __init__(self, tensors, mask: Optional[Tensor]):
+        self.tensors = tensors
+        self.mask = mask
+
+    def to(self, device):
+        # type: (Device) -> NestedTensor # noqa
+        cast_tensor = self.tensors.to(device)
+        mask = self.mask
+        if mask is not None:
+            cast_mask = mask.to(device)
+        else:
+            cast_mask = None
+        return NestedTensor(cast_tensor, cast_mask)
+
+    def decompose(self):
+        return self.tensors, self.mask
+
+    def __repr__(self):
+        return str(self.tensors)
+
+
+def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
+    if tensor_list[0].ndim == 3:
+        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
+        batch_shape = [len(tensor_list)] + max_size
+        b, c, h, w = batch_shape
+        dtype = tensor_list[0].dtype
+        device = tensor_list[0].device
+        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
+        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
+        for img, pad_img, m in zip(tensor_list, tensor, mask):
+            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+            m[: img.shape[1], : img.shape[2]] = False
+    else:
+        raise ValueError("Only 3-dimensional tensors are supported")
+    return NestedTensor(tensor, mask)
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 1e586729615..9bfb9722170 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -39,6 +39,7 @@ from .file_utils import (
     is_sentencepiece_available,
     is_soundfile_availble,
     is_tf_available,
+    is_timm_available,
     is_tokenizers_available,
     is_torch_available,
     is_torch_tpu_available,
@@ -229,6 +230,19 @@ def require_onnx(test_case):
         return test_case
 
 
+def require_timm(test_case):
+    """
+    Decorator marking a test that requires Timm.
+
+    These tests are skipped when Timm isn't installed.
+
+    """
+    if not is_timm_available():
+        return unittest.skip("test requires Timm")(test_case)
+    else:
+        return test_case
+
+
 def require_torch(test_case):
     """
     Decorator marking a test that requires PyTorch.
diff --git a/src/transformers/utils/coco_classes.py b/src/transformers/utils/coco_classes.py
new file mode 100644
index 00000000000..cc540052aef
--- /dev/null
+++ b/src/transformers/utils/coco_classes.py
@@ -0,0 +1,94 @@
+# COCO object detection id's to class names
+id2label = {
+    0: "N/A",
+    1: "person",
+    2: "bicycle",
+    3: "car",
+    4: "motorcycle",
+    5: "airplane",
+    6: "bus",
+    7: "train",
+    8: "truck",
+    9: "boat",
+    10: "traffic light",
+    11: "fire hydrant",
+    12: "N/A",
+    13: "stop sign",
+    14: "parking meter",
+    15: "bench",
+    16: "bird",
+    17: "cat",
+    18: "dog",
+    19: "horse",
+    20: "sheep",
+    21: "cow",
+    22: "elephant",
+    23: "bear",
+    24: "zebra",
+    25: "giraffe",
+    26: "N/A",
+    27: "backpack",
+    28: "umbrella",
+    29: "N/A",
+    30: "N/A",
+    31: "handbag",
+    32: "tie",
+    33: "suitcase",
+    34: "frisbee",
+    35: "skis",
+    36: "snowboard",
+    37: "sports ball",
+    38: "kite",
+    39: "baseball bat",
+    40: "baseball glove",
+    41: "skateboard",
+    42: "surfboard",
+    43: "tennis racket",
+    44: "bottle",
+    45: "N/A",
+    46: "wine glass",
+    47: "cup",
+    48: "fork",
+    49: "knife",
+    50: "spoon",
+    51: "bowl",
+    52: "banana",
+    53: "apple",
+    54: "sandwich",
+    55: "orange",
+    56: "broccoli",
+    57: "carrot",
+    58: "hot dog",
+    59: "pizza",
+    60: "donut",
+    61: "cake",
+    62: "chair",
+    63: "couch",
+    64: "potted plant",
+    65: "bed",
+    66: "N/A",
+    67: "dining table",
+    68: "N/A",
+    69: "N/A",
+    70: "toilet",
+    71: "N/A",
+    72: "tv",
+    73: "laptop",
+    74: "mouse",
+    75: "remote",
+    76: "keyboard",
+    77: "cell phone",
+    78: "microwave",
+    79: "oven",
+    80: "toaster",
+    81: "sink",
+    82: "refrigerator",
+    83: "N/A",
+    84: "book",
+    85: "clock",
+    86: "vase",
+    87: "scissors",
+    88: "teddy bear",
+    89: "hair drier",
+    90: "toothbrush",
+}
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 000a05d31df..036a0a1c5ac 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -334,6 +334,9 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = None
 MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = None
 
 
+MODEL_FOR_OBJECT_DETECTION_MAPPING = None
+
+
 MODEL_FOR_PRETRAINING_MAPPING = None
 
 
diff --git a/src/transformers/utils/dummy_timm_and_vision_objects.py b/src/transformers/utils/dummy_timm_and_vision_objects.py
new file mode 100644
index 00000000000..33acdf77725
--- /dev/null
+++ b/src/transformers/utils/dummy_timm_and_vision_objects.py
@@ -0,0 +1,24 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..file_utils import requires_backends
+
+
+DETR_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class DetrForObjectDetection:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm", "vision"])
+
+
+class DetrForSegmentation:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm", "vision"])
+
+
+class DetrModel:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm", "vision"])
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_backends(self, ["timm", "vision"])
diff --git a/src/transformers/utils/dummy_timm_objects.py b/src/transformers/utils/dummy_timm_objects.py
new file mode 100644
index 00000000000..bc46f681553
--- /dev/null
+++ b/src/transformers/utils/dummy_timm_objects.py
@@ -0,0 +1,24 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..file_utils import requires_backends
+
+
+DETR_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class DetrForObjectDetection:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm"])
+
+
+class DetrForSegmentation:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm"])
+
+
+class DetrModel:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm"])
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_backends(self, ["timm"])
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index 1798c9f73c8..84b37d35dfb 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -22,6 +22,11 @@ class DeiTFeatureExtractor:
         requires_backends(self, ["vision"])
 
 
+class DetrFeatureExtractor:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
+
+
 class ViTFeatureExtractor:
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["vision"])
diff --git a/tests/fixtures/coco.jpg b/tests/fixtures/coco.jpg
deleted file mode 100644
index d32344928e3..00000000000
Binary files a/tests/fixtures/coco.jpg and /dev/null differ
diff --git a/tests/fixtures/tests_samples/.gitignore b/tests/fixtures/tests_samples/.gitignore
index f5030eb61e7..1d7141c43dc 100644
--- a/tests/fixtures/tests_samples/.gitignore
+++ b/tests/fixtures/tests_samples/.gitignore
@@ -1,4 +1,3 @@
-*.*
 cache*
 temp*
 !*.txt
diff --git a/tests/fixtures/tests_samples/COCO/cats.png b/tests/fixtures/tests_samples/COCO/000000039769.png
similarity index 100%
rename from tests/fixtures/tests_samples/COCO/cats.png
rename to tests/fixtures/tests_samples/COCO/000000039769.png
diff --git a/tests/fixtures/tests_samples/COCO/coco_annotations.txt b/tests/fixtures/tests_samples/COCO/coco_annotations.txt
new file mode 100644
index 00000000000..bd8c86a9bc3
--- /dev/null
+++ b/tests/fixtures/tests_samples/COCO/coco_annotations.txt
@@ -0,0 +1 @@
+[{"segmentation": [[333.96, 175.14, 338.26, 134.33, 342.55, 95.67, 348.99, 79.57, 368.32, 80.64, 371.54, 91.38, 364.03, 106.41, 356.51, 145.07, 351.14, 166.55, 350.07, 184.8, 345.77, 185.88, 332.89, 178.36, 332.89, 172.99]], "area": 2120.991099999999, "iscrowd": 0, "image_id": 39769, "bbox": [332.89, 79.57, 38.65, 106.31], "category_id": 75, "id": 1108446}, {"segmentation": [[44.03, 86.01, 112.75, 74.2, 173.96, 77.42, 175.03, 89.23, 170.74, 98.9, 147.11, 102.12, 54.77, 119.3, 53.69, 119.3, 44.03, 113.93, 41.88, 94.6, 41.88, 94.6]], "area": 4052.607, "iscrowd": 0, "image_id": 39769, "bbox": [41.88, 74.2, 133.15, 45.1], "category_id": 75, "id": 1110067}, {"segmentation": [[1.08, 473.53, 633.17, 473.53, 557.66, 376.45, 535.01, 366.74, 489.71, 305.26, 470.29, 318.2, 456.27, 351.64, 413.12, 363.51, 376.45, 358.11, 348.4, 350.56, 363.51, 331.15, 357.03, 288.0, 353.8, 257.8, 344.09, 190.92, 333.3, 177.98, 345.17, 79.82, 284.76, 130.52, 265.35, 151.01, 308.49, 189.84, 317.12, 215.73, 293.39, 243.78, 269.66, 212.49, 235.15, 199.55, 214.65, 193.08, 187.69, 217.89, 159.64, 278.29, 135.91, 313.89, 169.35, 292.31, 203.87, 281.53, 220.04, 292.31, 220.04, 307.42, 175.82, 345.17, 155.33, 360.27, 105.71, 363.51, 85.21, 374.29, 74.43, 366.74, 70.11, 465.98, 42.07, 471.37, 33.44, 457.35, 34.52, 414.2, 29.12, 368.9, 9.71, 291.24, 46.38, 209.26, 99.24, 128.36, 131.6, 107.87, 50.7, 117.57, 40.99, 103.55, 40.99, 85.21, 60.4, 77.66, 141.3, 70.11, 173.66, 72.27, 174.74, 92.76, 204.94, 72.27, 225.44, 62.56, 262.11, 56.09, 292.31, 53.93, 282.61, 81.98, 298.79, 96.0, 310.65, 102.47, 348.4, 74.43, 373.21, 81.98, 430.38, 35.6, 484.31, 23.73, 540.4, 46.38, 593.26, 66.88, 638.56, 80.9, 632.09, 145.62, 581.39, 118.65, 543.64, 130.52, 533.93, 167.19, 512.36, 197.39, 498.34, 218.97, 529.62, 253.48, 549.03, 273.98, 584.63, 276.13, 587.87, 293.39, 566.29, 305.26, 531.78, 298.79, 549.03, 319.28, 576.0, 358.11, 560.9, 376.45, 639.64, 471.37, 639.64, 2.16, 1.08, 0.0]], "area": 176277.55269999994, "iscrowd": 0, "image_id": 39769, "bbox": [1.08, 0.0, 638.56, 473.53], "category_id": 63, "id": 1605237}, {"segmentation": [[1.07, 1.18, 640.0, 3.33, 638.93, 472.59, 4.3, 479.03]], "area": 301552.6694999999, "iscrowd": 0, "image_id": 39769, "bbox": [1.07, 1.18, 638.93, 477.85], "category_id": 65, "id": 1612051}, {"segmentation": [[138.75, 319.38, 148.75, 294.38, 165.0, 246.87, 197.5, 205.63, 247.5, 203.13, 268.75, 216.88, 280.0, 239.38, 293.75, 244.38, 303.75, 241.88, 307.5, 228.13, 318.75, 220.63, 315.0, 200.63, 291.25, 171.88, 265.0, 156.88, 258.75, 148.13, 262.5, 135.63, 282.5, 123.13, 292.5, 115.63, 311.25, 108.13, 313.75, 106.88, 296.25, 93.13, 282.5, 84.38, 292.5, 64.38, 288.75, 60.63, 266.25, 54.38, 232.5, 63.12, 206.25, 70.63, 170.0, 100.63, 136.25, 114.38, 101.25, 138.13, 56.25, 194.38, 27.5, 259.38, 17.5, 299.38, 32.5, 378.13, 31.25, 448.13, 41.25, 469.38, 66.25, 466.88, 70.0, 419.38, 71.25, 391.88, 77.5, 365.63, 113.75, 364.38, 145.0, 360.63, 168.75, 349.38, 191.25, 330.63, 212.5, 319.38, 223.75, 305.63, 206.25, 286.88, 172.5, 288.13]], "area": 53301.618749999994, "iscrowd": 0, "image_id": 39769, "bbox": [17.5, 54.38, 301.25, 415.0], "category_id": 17, "id": 2190839}, {"segmentation": [[543.75, 136.88, 570.0, 114.38, 591.25, 123.13, 616.25, 140.63, 640.0, 143.13, 636.25, 124.37, 605.0, 103.13, 640.0, 103.13, 633.75, 86.88, 587.5, 73.13, 548.75, 49.38, 505.0, 35.63, 462.5, 25.63, 405.0, 48.13, 362.5, 111.88, 347.5, 179.38, 355.0, 220.63, 356.25, 230.63, 365.0, 264.38, 358.75, 266.88, 358.75, 270.63, 356.25, 291.88, 356.25, 325.63, 355.0, 338.13, 350.0, 348.13, 365.0, 354.38, 396.25, 351.88, 423.75, 355.63, 446.25, 350.63, 460.0, 345.63, 462.5, 321.88, 468.75, 306.88, 481.25, 299.38, 516.25, 341.88, 536.25, 368.13, 570.0, 369.38, 578.75, 359.38, 555.0, 330.63, 532.5, 298.13, 563.75, 299.38, 582.5, 298.13, 586.25, 286.88, 578.75, 278.13, 548.75, 269.38, 525.0, 256.88, 505.0, 206.88, 536.25, 161.88, 540.0, 149.38]], "area": 59700.95625, "iscrowd": 0, "image_id": 39769, "bbox": [347.5, 25.63, 292.5, 343.75], "category_id": 17, "id": 2190842}]
\ No newline at end of file
diff --git a/tests/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png b/tests/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png
new file mode 100644
index 00000000000..9dc23525d6e
Binary files /dev/null and b/tests/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png differ
diff --git a/tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt b/tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt
new file mode 100644
index 00000000000..90a9798be2a
--- /dev/null
+++ b/tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt
@@ -0,0 +1 @@
+[{"id": 8222595, "category_id": 17, "iscrowd": 0, "bbox": [18, 54, 301, 415], "area": 53306}, {"id": 8225432, "category_id": 17, "iscrowd": 0, "bbox": [349, 26, 291, 343], "area": 59627}, {"id": 8798150, "category_id": 63, "iscrowd": 0, "bbox": [1, 0, 639, 474], "area": 174579}, {"id": 14466198, "category_id": 75, "iscrowd": 0, "bbox": [42, 74, 133, 45], "area": 4068}, {"id": 12821912, "category_id": 75, "iscrowd": 0, "bbox": [333, 80, 38, 106], "area": 2118}, {"id": 10898909, "category_id": 93, "iscrowd": 0, "bbox": [0, 0, 640, 480], "area": 2750}]
\ No newline at end of file
diff --git a/tests/test_feature_extraction_common.py b/tests/test_feature_extraction_common.py
index 49dfa6dfd4d..217da135ca1 100644
--- a/tests/test_feature_extraction_common.py
+++ b/tests/test_feature_extraction_common.py
@@ -18,6 +18,57 @@ import json
 import os
 import tempfile
 
+from transformers.file_utils import is_torch_available, is_vision_available
+
+
+if is_torch_available():
+    import numpy as np
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+
+
+def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False):
+    """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
+    or a list of PyTorch tensors if one specifies torchify=True.
+    """
+
+    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
+
+    if equal_resolution:
+        image_inputs = []
+        for i in range(feature_extract_tester.batch_size):
+            image_inputs.append(
+                np.random.randint(
+                    255,
+                    size=(
+                        feature_extract_tester.num_channels,
+                        feature_extract_tester.max_resolution,
+                        feature_extract_tester.max_resolution,
+                    ),
+                    dtype=np.uint8,
+                )
+            )
+    else:
+        image_inputs = []
+        for i in range(feature_extract_tester.batch_size):
+            width, height = np.random.choice(
+                np.arange(feature_extract_tester.min_resolution, feature_extract_tester.max_resolution), 2
+            )
+            image_inputs.append(
+                np.random.randint(255, size=(feature_extract_tester.num_channels, width, height), dtype=np.uint8)
+            )
+
+    if not numpify and not torchify:
+        # PIL expects the channel dimension as last dimension
+        image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
+
+    if torchify:
+        image_inputs = [torch.from_numpy(x) for x in image_inputs]
+
+    return image_inputs
+
 
 class FeatureExtractionSavingTestMixin:
     def test_feat_extract_to_json_string(self):
diff --git a/tests/test_feature_extraction_deit.py b/tests/test_feature_extraction_deit.py
index a2b60eafe6e..dc86074dc98 100644
--- a/tests/test_feature_extraction_deit.py
+++ b/tests/test_feature_extraction_deit.py
@@ -21,7 +21,7 @@ import numpy as np
 from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
 
-from .test_feature_extraction_common import FeatureExtractionSavingTestMixin
+from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -75,36 +75,6 @@ class DeiTFeatureExtractionTester(unittest.TestCase):
             "image_std": self.image_std,
         }
 
-    def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
-        or a list of PyTorch tensors if one specifies torchify=True.
-        """
-
-        assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
-
-        if equal_resolution:
-            image_inputs = []
-            for i in range(self.batch_size):
-                image_inputs.append(
-                    np.random.randint(
-                        255, size=(self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8
-                    )
-                )
-        else:
-            image_inputs = []
-            for i in range(self.batch_size):
-                width, height = np.random.choice(np.arange(self.min_resolution, self.max_resolution), 2)
-                image_inputs.append(np.random.randint(255, size=(self.num_channels, width, height), dtype=np.uint8))
-
-        if not numpify and not torchify:
-            # PIL expects the channel dimension as last dimension
-            image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
-
-        if torchify:
-            image_inputs = [torch.from_numpy(x) for x in image_inputs]
-
-        return image_inputs
-
 
 @require_torch
 @require_vision
@@ -136,7 +106,7 @@ class DeiTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC
         # Initialize feature_extractor
         feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
         # create random PIL images
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
         for image in image_inputs:
             self.assertIsInstance(image, Image.Image)
 
@@ -168,7 +138,7 @@ class DeiTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC
         # Initialize feature_extractor
         feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
         # create random numpy tensors
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, numpify=True)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True)
         for image in image_inputs:
             self.assertIsInstance(image, np.ndarray)
 
@@ -200,7 +170,7 @@ class DeiTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC
         # Initialize feature_extractor
         feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
         # create random PyTorch tensors
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
         for image in image_inputs:
             self.assertIsInstance(image, torch.Tensor)
 
diff --git a/tests/test_feature_extraction_detr.py b/tests/test_feature_extraction_detr.py
new file mode 100644
index 00000000000..8f36ad418f5
--- /dev/null
+++ b/tests/test_feature_extraction_detr.py
@@ -0,0 +1,339 @@
+# coding=utf-8
+# Copyright 2021 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import pathlib
+import unittest
+
+import numpy as np
+
+from transformers.file_utils import is_torch_available, is_vision_available
+from transformers.testing_utils import require_torch, require_vision, slow
+
+from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import DetrFeatureExtractor
+
+
+class DetrFeatureExtractionTester(unittest.TestCase):
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        min_resolution=30,
+        max_resolution=400,
+        do_resize=True,
+        size=18,
+        max_size=1333,  # by setting max_size > max_resolution we're effectively not testing this :p
+        do_normalize=True,
+        image_mean=[0.5, 0.5, 0.5],
+        image_std=[0.5, 0.5, 0.5],
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.max_size = max_size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+
+    def prepare_feat_extract_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "max_size": self.max_size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+        }
+
+    def get_expected_values(self, image_inputs, batched=False):
+        """
+        This function computes the expected height and width when providing images to DetrFeatureExtractor,
+        assuming do_resize is set to True with a scalar size.
+        """
+        if not batched:
+            image = image_inputs[0]
+            if isinstance(image, Image.Image):
+                w, h = image.size
+            else:
+                h, w = image.shape[1], image.shape[2]
+            if w < h:
+                expected_height = int(self.size * h / w)
+                expected_width = self.size
+            elif w > h:
+                expected_height = self.size
+                expected_width = int(self.size * w / h)
+            else:
+                expected_height = self.size
+                expected_width = self.size
+
+        else:
+            expected_values = []
+            for image in image_inputs:
+                expected_height, expected_width = self.get_expected_values([image])
+                expected_values.append((expected_height, expected_width))
+            expected_height = max(expected_values, key=lambda item: item[0])[0]
+            expected_width = max(expected_values, key=lambda item: item[1])[1]
+
+        return expected_height, expected_width
+
+
+@require_torch
+@require_vision
+class DetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase):
+
+    feature_extraction_class = DetrFeatureExtractor if is_vision_available() else None
+
+    def setUp(self):
+        self.feature_extract_tester = DetrFeatureExtractionTester(self)
+
+    @property
+    def feat_extract_dict(self):
+        return self.feature_extract_tester.prepare_feat_extract_dict()
+
+    def test_feat_extract_properties(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        self.assertTrue(hasattr(feature_extractor, "image_mean"))
+        self.assertTrue(hasattr(feature_extractor, "image_std"))
+        self.assertTrue(hasattr(feature_extractor, "do_normalize"))
+        self.assertTrue(hasattr(feature_extractor, "do_resize"))
+        self.assertTrue(hasattr(feature_extractor, "size"))
+        self.assertTrue(hasattr(feature_extractor, "max_size"))
+
+    def test_batch_feature(self):
+        pass
+
+    def test_call_pil(self):
+        # Initialize feature_extractor
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        # create random PIL images
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
+        for image in image_inputs:
+            self.assertIsInstance(image, Image.Image)
+
+        # Test not batched input
+        encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values
+
+        expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs)
+
+        self.assertEqual(
+            encoded_images.shape,
+            (1, self.feature_extract_tester.num_channels, expected_height, expected_width),
+        )
+
+        # Test batched
+        expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True)
+
+        encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values
+        self.assertEqual(
+            encoded_images.shape,
+            (
+                self.feature_extract_tester.batch_size,
+                self.feature_extract_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
+
+    def test_call_numpy(self):
+        # Initialize feature_extractor
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        # create random numpy tensors
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True)
+        for image in image_inputs:
+            self.assertIsInstance(image, np.ndarray)
+
+        # Test not batched input
+        encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values
+
+        expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs)
+
+        self.assertEqual(
+            encoded_images.shape,
+            (1, self.feature_extract_tester.num_channels, expected_height, expected_width),
+        )
+
+        # Test batched
+        encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values
+
+        expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True)
+
+        self.assertEqual(
+            encoded_images.shape,
+            (
+                self.feature_extract_tester.batch_size,
+                self.feature_extract_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
+
+    def test_call_pytorch(self):
+        # Initialize feature_extractor
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        # create random PyTorch tensors
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
+        for image in image_inputs:
+            self.assertIsInstance(image, torch.Tensor)
+
+        # Test not batched input
+        encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values
+
+        expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs)
+
+        self.assertEqual(
+            encoded_images.shape,
+            (1, self.feature_extract_tester.num_channels, expected_height, expected_width),
+        )
+
+        # Test batched
+        encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values
+
+        expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True)
+
+        self.assertEqual(
+            encoded_images.shape,
+            (
+                self.feature_extract_tester.batch_size,
+                self.feature_extract_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
+
+    def test_equivalence_pad_and_create_pixel_mask(self):
+        # Initialize feature_extractors
+        feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
+        feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False)
+        # create random PyTorch tensors
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
+        for image in image_inputs:
+            self.assertIsInstance(image, torch.Tensor)
+
+        # Test whether the method "pad_and_return_pixel_mask" and calling the feature extractor return the same tensors
+        encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt")
+        encoded_images = feature_extractor_2(image_inputs, return_tensors="pt")
+
+        assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)
+        assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4)
+
+    @slow
+    def test_call_pytorch_with_coco_detection_annotations(self):
+        # prepare image and target
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
+            target = json.loads(f.read())
+
+        target = {"image_id": 39769, "annotations": target}
+
+        # encode them
+        # TODO replace by facebook/detr-resnet-50
+        feature_extractor = DetrFeatureExtractor.from_pretrained("nielsr/detr-resnet-50")
+        encoding = feature_extractor(images=image, annotations=target, return_tensors="pt")
+
+        # verify pixel values
+        expected_shape = torch.Size([1, 3, 800, 1066])
+        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
+
+        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
+        assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)
+
+        # verify area
+        expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438])
+        assert torch.allclose(encoding["target"][0]["area"], expected_area)
+        # verify boxes
+        expected_boxes_shape = torch.Size([6, 4])
+        self.assertEqual(encoding["target"][0]["boxes"].shape, expected_boxes_shape)
+        expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
+        assert torch.allclose(encoding["target"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)
+        # verify image_id
+        expected_image_id = torch.tensor([39769])
+        assert torch.allclose(encoding["target"][0]["image_id"], expected_image_id)
+        # verify is_crowd
+        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
+        assert torch.allclose(encoding["target"][0]["iscrowd"], expected_is_crowd)
+        # verify class_labels
+        expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
+        assert torch.allclose(encoding["target"][0]["class_labels"], expected_class_labels)
+        # verify orig_size
+        expected_orig_size = torch.tensor([480, 640])
+        assert torch.allclose(encoding["target"][0]["orig_size"], expected_orig_size)
+        # verify size
+        expected_size = torch.tensor([800, 1066])
+        assert torch.allclose(encoding["target"][0]["size"], expected_size)
+
+    @slow
+    def test_call_pytorch_with_coco_panoptic_annotations(self):
+        # prepare image, target and masks_path
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
+            target = json.loads(f.read())
+
+        target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
+
+        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
+
+        # encode them
+        # TODO replace by .from_pretrained facebook/detr-resnet-50-panoptic
+        feature_extractor = DetrFeatureExtractor(format="coco_panoptic")
+        encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
+
+        # verify pixel values
+        expected_shape = torch.Size([1, 3, 800, 1066])
+        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
+
+        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
+        assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)
+
+        # verify area
+        expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147])
+        assert torch.allclose(encoding["target"][0]["area"], expected_area)
+        # verify boxes
+        expected_boxes_shape = torch.Size([6, 4])
+        self.assertEqual(encoding["target"][0]["boxes"].shape, expected_boxes_shape)
+        expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
+        assert torch.allclose(encoding["target"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)
+        # verify image_id
+        expected_image_id = torch.tensor([39769])
+        assert torch.allclose(encoding["target"][0]["image_id"], expected_image_id)
+        # verify is_crowd
+        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
+        assert torch.allclose(encoding["target"][0]["iscrowd"], expected_is_crowd)
+        # verify class_labels
+        expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
+        assert torch.allclose(encoding["target"][0]["class_labels"], expected_class_labels)
+        # verify masks
+        expected_masks_sum = 822338
+        self.assertEqual(encoding["target"][0]["masks"].sum().item(), expected_masks_sum)
+        # verify orig_size
+        expected_orig_size = torch.tensor([480, 640])
+        assert torch.allclose(encoding["target"][0]["orig_size"], expected_orig_size)
+        # verify size
+        expected_size = torch.tensor([800, 1066])
+        assert torch.allclose(encoding["target"][0]["size"], expected_size)
diff --git a/tests/test_feature_extraction_vit.py b/tests/test_feature_extraction_vit.py
index 5c8db9baa63..283a94d8ac1 100644
--- a/tests/test_feature_extraction_vit.py
+++ b/tests/test_feature_extraction_vit.py
@@ -21,7 +21,7 @@ import numpy as np
 from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
 
-from .test_feature_extraction_common import FeatureExtractionSavingTestMixin
+from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -69,36 +69,6 @@ class ViTFeatureExtractionTester(unittest.TestCase):
             "size": self.size,
         }
 
-    def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
-        or a list of PyTorch tensors if one specifies torchify=True.
-        """
-
-        assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
-
-        if equal_resolution:
-            image_inputs = []
-            for i in range(self.batch_size):
-                image_inputs.append(
-                    np.random.randint(
-                        255, size=(self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8
-                    )
-                )
-        else:
-            image_inputs = []
-            for i in range(self.batch_size):
-                width, height = np.random.choice(np.arange(self.min_resolution, self.max_resolution), 2)
-                image_inputs.append(np.random.randint(255, size=(self.num_channels, width, height), dtype=np.uint8))
-
-        if not numpify and not torchify:
-            # PIL expects the channel dimension as last dimension
-            image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
-
-        if torchify:
-            image_inputs = [torch.from_numpy(x) for x in image_inputs]
-
-        return image_inputs
-
 
 @require_torch
 @require_vision
@@ -128,7 +98,7 @@ class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCa
         # Initialize feature_extractor
         feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
         # create random PIL images
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
         for image in image_inputs:
             self.assertIsInstance(image, Image.Image)
 
@@ -160,7 +130,7 @@ class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCa
         # Initialize feature_extractor
         feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
         # create random numpy tensors
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, numpify=True)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True)
         for image in image_inputs:
             self.assertIsInstance(image, np.ndarray)
 
@@ -192,7 +162,7 @@ class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCa
         # Initialize feature_extractor
         feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
         # create random PyTorch tensors
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
         for image in image_inputs:
             self.assertIsInstance(image, torch.Tensor)
 
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 7223bfa5376..272f25a0ecf 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -21,7 +21,7 @@ import random
 import tempfile
 import unittest
 import warnings
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 
 from huggingface_hub import HfApi
 from requests.exceptions import HTTPError
@@ -982,7 +982,6 @@ class ModelTesterMixin:
 
         outputs = model(**inputs)
 
-        print(outputs)
         output = outputs[0]
 
         if config.is_encoder_decoder:
@@ -1236,6 +1235,11 @@ class ModelTesterMixin:
                     if isinstance(tuple_object, (List, Tuple)):
                         for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
                             recursive_check(tuple_iterable_value, dict_iterable_value)
+                    elif isinstance(tuple_object, Dict):
+                        for tuple_iterable_value, dict_iterable_value in zip(
+                            tuple_object.values(), dict_object.values()
+                        ):
+                            recursive_check(tuple_iterable_value, dict_iterable_value)
                     elif tuple_object is None:
                         return
                     else:
diff --git a/tests/test_modeling_deit.py b/tests/test_modeling_deit.py
index d4d95f0b491..5551da08903 100644
--- a/tests/test_modeling_deit.py
+++ b/tests/test_modeling_deit.py
@@ -360,7 +360,7 @@ class DeiTModelTest(ModelTesterMixin, unittest.TestCase):
 
 # We will verify our results on an image of cute cats
 def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/cats.png")
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
     return image
 
 
diff --git a/tests/test_modeling_detr.py b/tests/test_modeling_detr.py
new file mode 100644
index 00000000000..093e75cf993
--- /dev/null
+++ b/tests/test_modeling_detr.py
@@ -0,0 +1,527 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch DETR model. """
+
+
+import inspect
+import math
+import unittest
+
+from transformers import is_timm_available, is_vision_available
+from transformers.file_utils import cached_property
+from transformers.testing_utils import require_timm, require_vision, slow, torch_device
+
+from .test_configuration_common import ConfigTester
+from .test_generation_utils import GenerationTesterMixin
+from .test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+
+
+if is_timm_available():
+    import torch
+
+    from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrModel
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import DetrFeatureExtractor
+
+
+@require_timm
+class DetrModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=8,
+        is_training=True,
+        use_labels=True,
+        hidden_size=256,
+        num_hidden_layers=2,
+        num_attention_heads=8,
+        intermediate_size=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        num_queries=12,
+        num_channels=3,
+        min_size=200,
+        max_size=200,
+        n_targets=8,
+        num_labels=91,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.num_queries = num_queries
+        self.num_channels = num_channels
+        self.min_size = min_size
+        self.max_size = max_size
+        self.n_targets = n_targets
+        self.num_labels = num_labels
+
+        # we also set the expected seq length for both encoder and decoder
+        self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32)
+        self.decoder_seq_length = self.num_queries
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size])
+
+        pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device)
+
+        labels = None
+        if self.use_labels:
+            # labels is a list of Dict (each Dict being the labels for a given example in the batch)
+            labels = []
+            for i in range(self.batch_size):
+                target = {}
+                target["class_labels"] = torch.randint(
+                    high=self.num_labels, size=(self.n_targets,), device=torch_device
+                )
+                target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device)
+                target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device)
+                labels.append(target)
+
+        config = DetrConfig(
+            d_model=self.hidden_size,
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            num_queries=self.num_queries,
+            num_labels=self.num_labels,
+        )
+        return config, pixel_values, pixel_mask, labels
+
+    def prepare_config_and_inputs_for_common(self):
+        config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs()
+        inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask}
+        return config, inputs_dict
+
+    def create_and_check_detr_model(self, config, pixel_values, pixel_mask, labels):
+        model = DetrModel(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
+        result = model(pixel_values)
+
+        self.parent.assertEqual(
+            result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size)
+        )
+
+    def create_and_check_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels):
+        model = DetrForObjectDetection(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
+        result = model(pixel_values)
+
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1))
+        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
+
+        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
+
+        self.parent.assertEqual(result.loss.shape, ())
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1))
+        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
+
+
+@require_timm
+class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (
+            DetrModel,
+            DetrForObjectDetection,
+            DetrForSegmentation,
+        )
+        if is_timm_available()
+        else ()
+    )
+    is_encoder_decoder = True
+    test_torchscript = False
+    test_pruning = False
+    test_head_masking = False
+    test_missing_keys = False
+
+    # special case for head models
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+
+        if return_labels:
+            if model_class.__name__ in ["DetrForObjectDetection", "DetrForSegmentation"]:
+                labels = []
+                for i in range(self.model_tester.batch_size):
+                    target = {}
+                    target["class_labels"] = torch.ones(
+                        size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long
+                    )
+                    target["boxes"] = torch.ones(
+                        self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float
+                    )
+                    target["masks"] = torch.ones(
+                        self.model_tester.n_targets,
+                        self.model_tester.min_size,
+                        self.model_tester.max_size,
+                        device=torch_device,
+                        dtype=torch.float,
+                    )
+                    labels.append(target)
+                inputs_dict["labels"] = labels
+
+        return inputs_dict
+
+    def setUp(self):
+        self.model_tester = DetrModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=DetrConfig, has_text_modality=False)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_detr_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_detr_model(*config_and_inputs)
+
+    def test_detr_object_detection_head_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_detr_object_detection_head_model(*config_and_inputs)
+
+    @unittest.skip(reason="DETR does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="DETR does not have a get_input_embeddings method")
+    def test_model_common_attributes(self):
+        pass
+
+    @unittest.skip(reason="DETR is not a generative model")
+    def test_generate_without_input_ids(self):
+        pass
+
+    @unittest.skip(reason="DETR does not use token embeddings")
+    def test_resize_tokens_embeddings(self):
+        pass
+
+    def test_attention_outputs(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+
+        decoder_seq_length = self.model_tester.decoder_seq_length
+        encoder_seq_length = self.model_tester.encoder_seq_length
+        decoder_key_length = self.model_tester.decoder_seq_length
+        encoder_key_length = self.model_tester.encoder_seq_length
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = False
+            config.return_dict = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            # check that output_attentions also work using config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
+            out_len = len(outputs)
+
+            if self.is_encoder_decoder:
+                correct_outlen = 5
+
+                # loss is at first position
+                if "labels" in inputs_dict:
+                    correct_outlen += 1  # loss is added to beginning
+                # Object Detection model returns pred_logits and pred_boxes
+                if model_class.__name__ == "DetrForObjectDetection":
+                    correct_outlen += 2
+                # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks
+                if model_class.__name__ == "DetrForSegmentation":
+                    correct_outlen += 3
+                if "past_key_values" in outputs:
+                    correct_outlen += 1  # past_key_values have been returned
+
+                self.assertEqual(out_len, correct_outlen)
+
+                # decoder attentions
+                decoder_attentions = outputs.decoder_attentions
+                self.assertIsInstance(decoder_attentions, (list, tuple))
+                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                self.assertListEqual(
+                    list(decoder_attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
+                )
+
+                # cross attentions
+                cross_attentions = outputs.cross_attentions
+                self.assertIsInstance(cross_attentions, (list, tuple))
+                self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
+                self.assertListEqual(
+                    list(cross_attentions[0].shape[-3:]),
+                    [
+                        self.model_tester.num_attention_heads,
+                        decoder_seq_length,
+                        encoder_key_length,
+                    ],
+                )
+
+            # Check attention is always last and order is fine
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            if hasattr(self.model_tester, "num_hidden_states_types"):
+                added_hidden_states = self.model_tester.num_hidden_states_types
+            elif self.is_encoder_decoder:
+                added_hidden_states = 2
+            else:
+                added_hidden_states = 1
+            self.assertEqual(out_len + added_hidden_states, len(outputs))
+
+            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+
+            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(self_attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
+
+    def test_retain_grad_hidden_states_attentions(self):
+        # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.output_hidden_states = True
+        config.output_attentions = True
+
+        # no need to test all models as different heads yield the same functionality
+        model_class = self.all_model_classes[0]
+        model = model_class(config)
+        model.to(torch_device)
+
+        inputs = self._prepare_for_class(inputs_dict, model_class)
+
+        outputs = model(**inputs)
+
+        output = outputs[0]
+
+        encoder_hidden_states = outputs.encoder_hidden_states[0]
+        encoder_attentions = outputs.encoder_attentions[0]
+        encoder_hidden_states.retain_grad()
+        encoder_attentions.retain_grad()
+
+        decoder_attentions = outputs.decoder_attentions[0]
+        decoder_attentions.retain_grad()
+
+        cross_attentions = outputs.cross_attentions[0]
+        cross_attentions.retain_grad()
+
+        output.flatten()[0].backward(retain_graph=True)
+
+        self.assertIsNotNone(encoder_hidden_states.grad)
+        self.assertIsNotNone(encoder_attentions.grad)
+        self.assertIsNotNone(decoder_attentions.grad)
+        self.assertIsNotNone(cross_attentions.grad)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            if model.config.is_encoder_decoder:
+                expected_arg_names = ["pixel_values", "pixel_mask"]
+                expected_arg_names.extend(
+                    ["head_mask", "decoder_head_mask", "encoder_outputs"]
+                    if "head_mask" and "decoder_head_mask" in arg_names
+                    else []
+                )
+                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+            else:
+                expected_arg_names = ["pixel_values", "pixel_mask"]
+                self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def test_different_timm_backbone(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # let's pick a random timm backbone
+        config.backbone = "tf_mobilenetv3_small_075"
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            if model_class.__name__ == "DetrForObjectDetection":
+                expected_shape = (
+                    self.model_tester.batch_size,
+                    self.model_tester.num_queries,
+                    self.model_tester.num_labels + 1,
+                )
+                self.assertEqual(outputs.logits.shape, expected_shape)
+
+            self.assertTrue(outputs)
+
+    def test_initialization(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        configs_no_init = _config_zero_init(config)
+        configs_no_init.init_xavier_std = 1e9
+
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            for name, param in model.named_parameters():
+                if param.requires_grad:
+                    if "bbox_attention" in name and "bias" not in name:
+                        self.assertLess(
+                            100000,
+                            abs(param.data.max().item()),
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )
+                    else:
+                        self.assertIn(
+                            ((param.data.mean() * 1e9).round() / 1e9).item(),
+                            [0.0, 1.0],
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )
+
+
+TOLERANCE = 1e-4
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    return image
+
+
+@require_timm
+@require_vision
+@slow
+class DetrModelIntegrationTests(unittest.TestCase):
+    @cached_property
+    def default_feature_extractor(self):
+        return DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50") if is_vision_available() else None
+
+    def test_inference_no_head(self):
+        model = DetrModel.from_pretrained("facebook/detr-resnet-50").to(torch_device)
+
+        feature_extractor = self.default_feature_extractor
+        image = prepare_img()
+        encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+
+        with torch.no_grad():
+            outputs = model(**encoding)
+
+        expected_shape = torch.Size((1, 100, 256))
+        assert outputs.last_hidden_state.shape == expected_shape
+        expected_slice = torch.tensor(
+            [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
+
+    def test_inference_object_detection_head(self):
+        model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(torch_device)
+
+        feature_extractor = self.default_feature_extractor
+        image = prepare_img()
+        encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+        pixel_values = encoding["pixel_values"].to(torch_device)
+        pixel_mask = encoding["pixel_mask"].to(torch_device)
+
+        with torch.no_grad():
+            outputs = model(pixel_values, pixel_mask)
+
+        expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1))
+        self.assertEqual(outputs.logits.shape, expected_shape_logits)
+        expected_slice_logits = torch.tensor(
+            [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4))
+
+        expected_shape_boxes = torch.Size((1, model.config.num_queries, 4))
+        self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
+        expected_slice_boxes = torch.tensor(
+            [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
+
+    def test_inference_panoptic_segmentation_head(self):
+        model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic").to(torch_device)
+
+        feature_extractor = self.default_feature_extractor
+        image = prepare_img()
+        encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+        pixel_values = encoding["pixel_values"].to(torch_device)
+        pixel_mask = encoding["pixel_mask"].to(torch_device)
+
+        with torch.no_grad():
+            outputs = model(pixel_values, pixel_mask)
+
+        expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1))
+        self.assertEqual(outputs.logits.shape, expected_shape_logits)
+        expected_slice_logits = torch.tensor(
+            [[-18.1565, -1.7568, -13.5029], [-16.8888, -1.4138, -14.1028], [-17.5709, -2.5080, -11.8654]]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4))
+
+        expected_shape_boxes = torch.Size((1, model.config.num_queries, 4))
+        self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
+        expected_slice_boxes = torch.tensor(
+            [[0.5344, 0.1789, 0.9285], [0.4420, 0.0572, 0.0875], [0.6630, 0.6887, 0.1017]]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
+
+        expected_shape_masks = torch.Size((1, model.config.num_queries, 200, 267))
+        self.assertEqual(outputs.pred_masks.shape, expected_shape_masks)
+        expected_slice_masks = torch.tensor(
+            [[-7.7558, -10.8788, -11.9797], [-11.8881, -16.4329, -17.7451], [-14.7316, -19.7383, -20.3004]]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, atol=1e-4))
diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py
index b5436b7dc0e..09d4fa372a5 100644
--- a/tests/test_modeling_vit.py
+++ b/tests/test_modeling_vit.py
@@ -322,7 +322,7 @@ class ViTModelTest(ModelTesterMixin, unittest.TestCase):
 
 # We will verify our results on an image of cute cats
 def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/cats.png")
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
     return image
 
 
diff --git a/tests/test_pipelines_image_classification.py b/tests/test_pipelines_image_classification.py
index 32b13174613..ecfab4c76dd 100644
--- a/tests/test_pipelines_image_classification.py
+++ b/tests/test_pipelines_image_classification.py
@@ -47,11 +47,26 @@ class ImageClassificationPipelineTests(unittest.TestCase):
                 "http://images.cocodataset.org/val2017/000000039769.jpg",
             ]
         },
-        {"images": "tests/fixtures/coco.jpg"},
-        {"images": ["tests/fixtures/coco.jpg", "tests/fixtures/coco.jpg"]},
-        {"images": Image.open("tests/fixtures/coco.jpg")},
-        {"images": [Image.open("tests/fixtures/coco.jpg"), Image.open("tests/fixtures/coco.jpg")]},
-        {"images": [Image.open("tests/fixtures/coco.jpg"), "tests/fixtures/coco.jpg"]},
+        {"images": "./tests/fixtures/tests_samples/COCO/000000039769.png"},
+        {
+            "images": [
+                "./tests/fixtures/tests_samples/COCO/000000039769.png",
+                "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            ]
+        },
+        {"images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")},
+        {
+            "images": [
+                Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+            ]
+        },
+        {
+            "images": [
+                Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            ]
+        },
     ]
 
     def test_small_model_from_factory(self):
diff --git a/utils/check_repo.py b/utils/check_repo.py
index e0eed1dbe80..63499fe5f8d 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -38,6 +38,9 @@ IGNORE_NON_TESTED = [
     "BigBirdPegasusEncoder",  # Building part of bigger (tested) model.
     "BigBirdPegasusDecoder",  # Building part of bigger (tested) model.
     "BigBirdPegasusDecoderWrapper",  # Building part of bigger (tested) model.
+    "DetrEncoder",  # Building part of bigger (tested) model.
+    "DetrDecoder",  # Building part of bigger (tested) model.
+    "DetrDecoderWrapper",  # Building part of bigger (tested) model.
     "M2M100Encoder",  # Building part of bigger (tested) model.
     "M2M100Decoder",  # Building part of bigger (tested) model.
     "Speech2TextEncoder",  # Building part of bigger (tested) model.
@@ -95,6 +98,7 @@ IGNORE_NON_AUTO_CONFIGURED = [
     "CLIPVisionModel",
     "FlaxCLIPTextModel",
     "FlaxCLIPVisionModel",
+    "DetrForSegmentation",
     "DPRReader",
     "DPRSpanPredictor",
     "FlaubertForQuestionAnswering",