diff --git a/.circleci/config.yml b/.circleci/config.yml index 93b9e675f16..a393837806e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -139,7 +139,7 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev - run: pip install --upgrade pip - - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision] + - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision,timm] - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html - save_cache: key: v0.4-torch-{{ checksum "setup.py" }} diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index 439822e068c..06618d08e8d 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -37,7 +37,7 @@ jobs: run: | apt -y update && apt install -y libsndfile1-dev pip install --upgrade pip - pip install .[sklearn,testing,onnxruntime,sentencepiece,speech] + pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm] - name: Are GPUs recognized by our DL frameworks run: | @@ -121,7 +121,7 @@ jobs: run: | apt -y update && apt install -y libsndfile1-dev pip install --upgrade pip - pip install .[sklearn,testing,onnxruntime,sentencepiece,speech] + pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm] - name: Are GPUs recognized by our DL frameworks run: | diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index b6a3d65bee1..f98215a62c4 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -33,7 +33,7 @@ jobs: run: | apt -y update && apt install -y libsndfile1-dev pip install --upgrade pip - pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,integrations] + pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm] - name: Are GPUs recognized by our DL frameworks run: | @@ -155,7 +155,7 @@ jobs: run: | apt -y update && apt install -y libsndfile1-dev pip install --upgrade pip - pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,integrations] + pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm] - name: Are GPUs recognized by our DL frameworks run: | diff --git a/README.md b/README.md index 37e1ee96433..7bd67b49def 100644 --- a/README.md +++ b/README.md @@ -215,6 +215,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT. 1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval diff --git a/docs/source/community.md b/docs/source/community.md index 38affbf1e68..206d8b4cf58 100644 --- a/docs/source/community.md +++ b/docs/source/community.md @@ -59,3 +59,5 @@ This page regroups resources around 🤗 Transformers developed by the community | [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | How to evaluate *LukeForEntitySpanClassification* on the CoNLL-2003 dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | | [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | How to evaluate *BigBirdPegasusForConditionalGeneration* on PubMed dataset | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | | [Speech Emotion Classification with Wav2Vec2](https://github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | How to use a trained *DetrForObjectDetection* model to detect objects in an image and visualize attention | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | How to fine-tune *DetrForObjectDetection* on a custom object detection dataset | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | diff --git a/docs/source/index.rst b/docs/source/index.rst index 5f51bf819a1..9e2d093eb8a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -153,128 +153,131 @@ Supported models 19. :doc:`DeiT ` (from Facebook) released with the paper `Training data-efficient image transformers & distillation through attention `__ by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -20. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale +20. :doc:`DETR ` (from Facebook) released with the paper `End-to-End Object Detection with Transformers + `__ by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, + Alexander Kirillov, Sergey Zagoruyko. +21. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation `__ by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -21. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a +22. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter `__ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 `__, RoBERTa into `DistilRoBERTa `__, Multilingual BERT into `DistilmBERT `__ and a German version of DistilBERT. -22. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain +23. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain Question Answering `__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -23. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: +24. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: Pre-training text encoders as discriminators rather than generators `__ by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -24. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model +25. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French `__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -25. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: +26. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing `__ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -26. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative +27. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training `__ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. -27. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask +28. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask Learners `__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -28. :doc:`GPT Neo ` (from EleutherAI) released in the repository `EleutherAI/gpt-neo +29. :doc:`GPT Neo ` (from EleutherAI) released in the repository `EleutherAI/gpt-neo `__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. -29. :doc:`I-BERT ` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization +30. :doc:`I-BERT ` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization `__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer -30. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training +31. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training of Text and Layout for Document Image Understanding `__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -31. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer +32. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -32. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document +33. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -33. :doc:`LUKE ` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity +34. :doc:`LUKE ` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention `__ by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -34. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality +35. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering `__ by Hao Tan and Mohit Bansal. -35. :doc:`M2M100 ` (from Facebook) released with the paper `Beyond English-Centric Multilingual +36. :doc:`M2M100 ` (from Facebook) released with the paper `Beyond English-Centric Multilingual Machine Translation `__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -36. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by +37. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by Jörg Tiedemann. The `Marian Framework `__ is being developed by the Microsoft Translator Team. -37. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for +38. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for Neural Machine Translation `__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -38. :doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible +39. :doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible Multilingual Pretraining and Finetuning `__ by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -39. :doc:`Megatron-BERT ` (from NVIDIA) released with the paper `Megatron-LM: Training +40. :doc:`Megatron-BERT ` (from NVIDIA) released with the paper `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -40. :doc:`Megatron-GPT2 ` (from NVIDIA) released with the paper `Megatron-LM: Training +41. :doc:`Megatron-GPT2 ` (from NVIDIA) released with the paper `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -41. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted +42. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted Pre-training for Language Understanding `__ by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -42. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained +43. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained text-to-text transformer `__ by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -43. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted +44. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization `__> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. -44. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting +45. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -45. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient +46. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -46. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT +47. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach `__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -47. :doc:`RoFormer ` (from ZhuiyiTechnology), released together with the paper a `RoFormer: +48. :doc:`RoFormer ` (from ZhuiyiTechnology), released together with the paper a `RoFormer: Enhanced Transformer with Rotary Position Embedding `__ by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. -48. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper +49. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper `fairseq S2T: Fast Speech-to-Text Modeling with fairseq `__ by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -49. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP +50. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP about efficient neural networks? `__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -50. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a +51. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -51. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via +52. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via Pre-training `__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -52. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: +53. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `__ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -53. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 +54. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale `__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -54. :doc:`VisualBERT ` (from UCLA NLP) released with the paper `VisualBERT: A Simple and +55. :doc:`VisualBERT ` (from UCLA NLP) released with the paper `VisualBERT: A Simple and Performant Baseline for Vision and Language `__ by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -55. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for +56. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations `__ by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -56. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model +57. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `__ by Guillaume Lample and Alexis Conneau. -57. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: +58. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -58. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised +59. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `__ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -59. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive +60. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `__ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -60. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised +61. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised Cross-Lingual Representation Learning For Speech Recognition `__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. @@ -318,6 +321,8 @@ Flax), PyTorch, and/or TensorFlow. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | DeBERTa | ✅ | ✅ | ✅ | ❌ | ❌ | @@ -502,6 +507,7 @@ Flax), PyTorch, and/or TensorFlow. model_doc/deberta model_doc/deberta_v2 model_doc/deit + model_doc/detr model_doc/dialogpt model_doc/distilbert model_doc/dpr diff --git a/docs/source/model_doc/detr.rst b/docs/source/model_doc/detr.rst new file mode 100644 index 00000000000..dbd1fb99aad --- /dev/null +++ b/docs/source/model_doc/detr.rst @@ -0,0 +1,202 @@ +.. + Copyright 2021 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +DETR +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The DETR model was proposed in `End-to-End Object Detection with Transformers `__ by +Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov and Sergey Zagoruyko. DETR +consists of a convolutional backbone followed by an encoder-decoder Transformer which can be trained end-to-end for +object detection. It greatly simplifies a lot of the complexity of models like Faster-R-CNN and Mask-R-CNN, which use +things like region proposals, non-maximum suppression procedure and anchor generation. Moreover, DETR can also be +naturally extended to perform panoptic segmentation, by simply adding a mask head on top of the decoder outputs. + +The abstract from the paper is the following: + +*We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the +detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression +procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the +new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via +bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries, +DETR reasons about the relations of the objects and the global image context to directly output the final set of +predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many +other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and +highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily +generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive +baselines.* + +This model was contributed by `nielsr `__. The original code can be found `here +`__. + +Here's a TLDR explaining how :class:`~transformers.DetrForObjectDetection` works: + +First, an image is sent through a pre-trained convolutional backbone (in the paper, the authors use +ResNet-50/ResNet-101). Let's assume we also add a batch dimension. This means that the input to the backbone is a +tensor of shape :obj:`(batch_size, 3, height, width)`, assuming the image has 3 color channels (RGB). The CNN backbone +outputs a new lower-resolution feature map, typically of shape :obj:`(batch_size, 2048, height/32, width/32)`. This is +then projected to match the hidden dimension of the Transformer of DETR, which is :obj:`256` by default, using a +:obj:`nn.Conv2D` layer. So now, we have a tensor of shape :obj:`(batch_size, 256, height/32, width/32).` Next, the +feature map is flattened and transposed to obtain a tensor of shape :obj:`(batch_size, seq_len, d_model)` = +:obj:`(batch_size, width/32*height/32, 256)`. So a difference with NLP models is that the sequence length is actually +longer than usual, but with a smaller :obj:`d_model` (which in NLP is typically 768 or higher). + +Next, this is sent through the encoder, outputting :obj:`encoder_hidden_states` of the same shape (you can consider +these as image features). Next, so-called **object queries** are sent through the decoder. This is a tensor of shape +:obj:`(batch_size, num_queries, d_model)`, with :obj:`num_queries` typically set to 100 and initialized with zeros. +These input embeddings are learnt positional encodings that the authors refer to as object queries, and similarly to +the encoder, they are added to the input of each attention layer. Each object query will look for a particular object +in the image. The decoder updates these embeddings through multiple self-attention and encoder-decoder attention layers +to output :obj:`decoder_hidden_states` of the same shape: :obj:`(batch_size, num_queries, d_model)`. Next, two heads +are added on top for object detection: a linear layer for classifying each object query into one of the objects or "no +object", and a MLP to predict bounding boxes for each query. + +The model is trained using a **bipartite matching loss**: so what we actually do is compare the predicted classes + +bounding boxes of each of the N = 100 object queries to the ground truth annotations, padded up to the same length N +(so if an image only contains 4 objects, 96 annotations will just have a "no object" as class and "no bounding box" as +bounding box). The `Hungarian matching algorithm `__ is used to find +an optimal one-to-one mapping of each of the N queries to each of the N annotations. Next, standard cross-entropy (for +the classes) and a linear combination of the L1 and `generalized IoU loss `__ (for the +bounding boxes) are used to optimize the parameters of the model. + +DETR can be naturally extended to perform panoptic segmentation (which unifies semantic segmentation and instance +segmentation). :class:`~transformers.DetrForSegmentation` adds a segmentation mask head on top of +:class:`~transformers.DetrForObjectDetection`. The mask head can be trained either jointly, or in a two steps process, +where one first trains a :class:`~transformers.DetrForObjectDetection` model to detect bounding boxes around both +"things" (instances) and "stuff" (background things like trees, roads, sky), then freeze all the weights and train only +the mask head for 25 epochs. Experimentally, these two approaches give similar results. Note that predicting boxes is +required for the training to be possible, since the Hungarian matching is computed using distances between boxes. + +Tips: + +- DETR uses so-called **object queries** to detect objects in an image. The number of queries determines the maximum + number of objects that can be detected in a single image, and is set to 100 by default (see parameter + :obj:`num_queries` of :class:`~transformers.DetrConfig`). Note that it's good to have some slack (in COCO, the + authors used 100, while the maximum number of objects in a COCO image is ~70). +- The decoder of DETR updates the query embeddings in parallel. This is different from language models like GPT-2, + which use autoregressive decoding instead of parallel. Hence, no causal attention mask is used. +- DETR adds position embeddings to the hidden states at each self-attention and cross-attention layer before projecting + to queries and keys. For the position embeddings of the image, one can choose between fixed sinusoidal or learned + absolute position embeddings. By default, the parameter :obj:`position_embedding_type` of + :class:`~transformers.DetrConfig` is set to :obj:`"sine"`. +- During training, the authors of DETR did find it helpful to use auxiliary losses in the decoder, especially to help + the model output the correct number of objects of each class. If you set the parameter :obj:`auxiliary_loss` of + :class:`~transformers.DetrConfig` to :obj:`True`, then prediction feedforward neural networks and Hungarian losses + are added after each decoder layer (with the FFNs sharing parameters). +- If you want to train the model in a distributed environment across multiple nodes, then one should update the + `num_boxes` variable in the `DetrLoss` class of `modeling_detr.py`. When training on multiple nodes, this should be + set to the average number of target boxes across all nodes, as can be seen in the original implementation `here + `__. +- :class:`~transformers.DetrForObjectDetection` and :class:`~transformers.DetrForSegmentation` can be initialized with + any convolutional backbone available in the `timm library `__. + Initializing with a MobileNet backbone for example can be done by setting the :obj:`backbone` attribute of + :class:`~transformers.DetrConfig` to :obj:`"tf_mobilenetv3_small_075"`, and then initializing the model with that + config. +- DETR resizes the input images such that the shortest side is at least a certain amount of pixels while the longest is + at most 1333 pixels. At training time, scale augmentation is used such that the shortest side is randomly set to at + least 480 and at most 800 pixels. At inference time, the shortest side is set to 800. One can use + :class:`~transformers.DetrFeatureExtractor` to prepare images (and optional annotations in COCO format) for the + model. Due to this resizing, images in a batch can have different sizes. DETR solves this by padding images up to the + largest size in a batch, and by creating a pixel mask that indicates which pixels are real/which are padding. + Alternatively, one can also define a custom :obj:`collate_fn` in order to batch images together, using + :meth:`~transformers.DetrFeatureExtractor.pad_and_create_pixel_mask`. +- The size of the images will determine the amount of memory being used, and will thus determine the :obj:`batch_size`. + It is advised to use a batch size of 2 per GPU. See `this Github thread + `__ for more info. + +As a summary, consider the following table: + ++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+ +| **Task** | **Object detection** | **Instance segmentation** | **Panoptic segmentation** | ++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+ +| **Description** | Predicting bounding boxes and class labels around | Predicting masks around objects (i.e. instances) in an image | Predicting masks around both objects (i.e. instances) as well as | +| | objects in an image | | "stuff" (i.e. background things like trees and roads) in an image | ++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+ +| **Model** | :class:`~transformers.DetrForObjectDetection` | :class:`~transformers.DetrForSegmentation` | :class:`~transformers.DetrForSegmentation` | ++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+ +| **Example dataset** | COCO detection | COCO detection, | COCO panoptic | +| | | COCO panoptic | | ++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+ +| **Format of annotations to provide to** | {‘image_id’: int, | {‘image_id’: int, | {‘file_name: str, | +| :class:`~transformers.DetrFeatureExtractor` | ‘annotations’: List[Dict]}, each Dict being a COCO | ‘annotations’: [List[Dict]] } (in case of COCO detection) | ‘image_id: int, | +| | object annotation (containing keys "image_id", | | ‘segments_info’: List[Dict] } | +| | | or | | +| | | | and masks_path (path to directory containing PNG files of the masks) | +| | | {‘file_name’: str, | | +| | | ‘image_id’: int, | | +| | | ‘segments_info’: List[Dict]} (in case of COCO panoptic) | | ++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+ +| **Postprocessing** (i.e. converting the | :meth:`~transformers.DetrFeatureExtractor.post_process` | :meth:`~transformers.DetrFeatureExtractor.post_process_segmentation` | :meth:`~transformers.DetrFeatureExtractor.post_process_segmentation`, | +| output of the model to COCO API) | | | :meth:`~transformers.DetrFeatureExtractor.post_process_panoptic` | ++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+ +| **evaluators** | :obj:`CocoEvaluator` with iou_types = “bbox” | :obj:`CocoEvaluator` with iou_types = “bbox”, “segm” | :obj:`CocoEvaluator` with iou_tupes = “bbox, “segm” | +| | | | | +| | | | :obj:`PanopticEvaluator` | ++---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+ + +In short, one should prepare the data either in COCO detection or COCO panoptic format, then use +:class:`~transformers.DetrFeatureExtractor` to create :obj:`pixel_values`, :obj:`pixel_mask` and optional +:obj:`labels`, which can then be used to train (or fine-tune) a model. For evaluation, one should first convert the +outputs of the model using one of the postprocessing methods of :class:`~transformers.DetrFeatureExtractor`. These can +be be provided to either :obj:`CocoEvaluator` or :obj:`PanopticEvaluator`, which allow you to calculate metrics like +mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are implemented in the `original repository +`__. See the example notebooks for more info regarding evaluation. + + +DETR specific outputs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.detr.modeling_detr.DetrModelOutput + :members: + +.. autoclass:: transformers.models.detr.modeling_detr.DetrObjectDetectionOutput + :members: + +.. autoclass:: transformers.models.detr.modeling_detr.DetrSegmentationOutput + :members: + + +DetrConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.DetrConfig + :members: + + +DetrFeatureExtractor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.DetrFeatureExtractor + :members: __call__, pad_and_create_pixel_mask, post_process, post_process_segmentation, post_process_panoptic + + +DetrModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.DetrModel + :members: forward + + +DetrForObjectDetection +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.DetrForObjectDetection + :members: forward + + +DetrForSegmentation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.DetrForSegmentation + :members: forward diff --git a/setup.py b/setup.py index b8ed916b0e0..32e4608102e 100644 --- a/setup.py +++ b/setup.py @@ -142,6 +142,7 @@ _deps = [ "tensorflow-cpu>=2.3", "tensorflow>=2.3", "timeout-decorator", + "timm", "tokenizers>=0.10.1,<0.11", "torch>=1.0", "torchaudio", @@ -249,6 +250,7 @@ extras["integrations"] = extras["optuna"] + extras["ray"] extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") extras["speech"] = deps_list("soundfile", "torchaudio") extras["vision"] = deps_list("Pillow") +extras["timm"] = deps_list("timm") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["testing"] = ( @@ -270,6 +272,7 @@ extras["all"] = ( + extras["speech"] + extras["vision"] + extras["integrations"] + + extras["timm"] ) extras["docs_specific"] = deps_list( diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c699983ac3d..386d64892cc 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -47,6 +47,7 @@ from .file_utils import ( is_sentencepiece_available, is_speech_available, is_tf_available, + is_timm_available, is_tokenizers_available, is_torch_available, is_vision_available, @@ -101,10 +102,12 @@ _import_structure = { "is_flax_available", "is_psutil_available", "is_py3nvml_available", + "is_scipy_available", "is_sentencepiece_available", "is_sklearn_available", "is_speech_available", "is_tf_available", + "is_timm_available", "is_tokenizers_available", "is_torch_available", "is_torch_tpu_available", @@ -180,6 +183,7 @@ _import_structure = { "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"], "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], + "models.detr": ["DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetrConfig"], "models.distilbert": ["DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DistilBertConfig", "DistilBertTokenizer"], "models.dpr": [ "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -405,6 +409,7 @@ if is_vision_available(): _import_structure["models.clip"].append("CLIPFeatureExtractor") _import_structure["models.clip"].append("CLIPProcessor") _import_structure["models.deit"].append("DeiTFeatureExtractor") + _import_structure["models.detr"].append("DetrFeatureExtractor") _import_structure["models.vit"].append("ViTFeatureExtractor") else: from .utils import dummy_vision_objects @@ -413,6 +418,23 @@ else: name for name in dir(dummy_vision_objects) if not name.startswith("_") ] +# Timm-backed objects +if is_timm_available() and is_vision_available(): + _import_structure["models.detr"].extend( + [ + "DETR_PRETRAINED_MODEL_ARCHIVE_LIST", + "DetrForObjectDetection", + "DetrForSegmentation", + "DetrModel", + ] + ) +else: + from .utils import dummy_timm_objects + + _import_structure["utils.dummy_timm_objects"] = [ + name for name in dir(dummy_timm_objects) if not name.startswith("_") + ] + # PyTorch-backed objects if is_torch_available(): _import_structure["benchmark.benchmark"] = ["PyTorchBenchmark"] @@ -489,6 +511,7 @@ if is_torch_available(): "MODEL_FOR_MASKED_LM_MAPPING", "MODEL_FOR_MULTIPLE_CHOICE_MAPPING", "MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", + "MODEL_FOR_OBJECT_DETECTION_MAPPING", "MODEL_FOR_PRETRAINING_MAPPING", "MODEL_FOR_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", @@ -1587,10 +1610,12 @@ if TYPE_CHECKING: is_flax_available, is_psutil_available, is_py3nvml_available, + is_scipy_available, is_sentencepiece_available, is_sklearn_available, is_speech_available, is_tf_available, + is_timm_available, is_tokenizers_available, is_torch_available, is_torch_tpu_available, @@ -1666,6 +1691,7 @@ if TYPE_CHECKING: from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig + from .models.detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer from .models.dpr import ( DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -1863,13 +1889,23 @@ if TYPE_CHECKING: from .image_utils import ImageFeatureExtractionMixin from .models.clip import CLIPFeatureExtractor, CLIPProcessor from .models.deit import DeiTFeatureExtractor + from .models.detr import DetrFeatureExtractor from .models.vit import ViTFeatureExtractor else: from .utils.dummy_vision_objects import * # Modeling - if is_torch_available(): + if is_timm_available() and is_vision_available(): + from .models.detr import ( + DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + DetrForObjectDetection, + DetrForSegmentation, + DetrModel, + ) + else: + from .utils.dummy_timm_objects import * + if is_torch_available(): # Benchmarks from .benchmark.benchmark import PyTorchBenchmark from .benchmark.benchmark_args import PyTorchBenchmarkArguments @@ -1939,6 +1975,7 @@ if TYPE_CHECKING: MODEL_FOR_MASKED_LM_MAPPING, MODEL_FOR_MULTIPLE_CHOICE_MAPPING, MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + MODEL_FOR_OBJECT_DETECTION_MAPPING, MODEL_FOR_PRETRAINING_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index ec055db25bd..0f4c9573991 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -59,6 +59,7 @@ deps = { "tensorflow-cpu": "tensorflow-cpu>=2.3", "tensorflow": "tensorflow>=2.3", "timeout-decorator": "timeout-decorator", + "timm": "timm", "tokenizers": "tokenizers>=0.10.1,<0.11", "torch": "torch>=1.0", "torchaudio": "torchaudio", diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 9a55fe18edc..51daa86cb34 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -174,6 +174,14 @@ except importlib_metadata.PackageNotFoundError: _soundfile_available = False +_timm_available = importlib.util.find_spec("timm") is not None +try: + _timm_version = importlib_metadata.version("timm") + logger.debug(f"Successfully imported timm version {_timm_version}") +except importlib_metadata.PackageNotFoundError: + _timm_available = False + + _torchaudio_available = importlib.util.find_spec("torchaudio") is not None try: _torchaudio_version = importlib_metadata.version("torchaudio") @@ -317,12 +325,14 @@ def is_faiss_available(): return _faiss_available +def is_scipy_available(): + return importlib.util.find_spec("scipy") is not None + + def is_sklearn_available(): if importlib.util.find_spec("sklearn") is None: return False - if importlib.util.find_spec("scipy") is None: - return False - return importlib.util.find_spec("sklearn.metrics") and importlib.util.find_spec("scipy.stats") + return is_scipy_available() and importlib.util.find_spec("sklearn.metrics") def is_sentencepiece_available(): @@ -411,6 +421,10 @@ def is_soundfile_availble(): return _soundfile_available +def is_timm_available(): + return _timm_available + + def is_torchaudio_available(): return _torchaudio_available @@ -536,12 +550,24 @@ explained here: https://pandas.pydata.org/pandas-docs/stable/getting_started/ins """ +# docstyle-ignore +SCIPY_IMPORT_ERROR = """ +{0} requires the scipy library but it was not found in your environment. You can install it with pip: +`pip install scipy` +""" + + # docstyle-ignore SPEECH_IMPORT_ERROR = """ {0} requires the torchaudio library but it was not found in your environment. You can install it with pip: `pip install torchaudio` """ +# docstyle-ignore +TIMM_IMPORT_ERROR = """ +{0} requires the timm library but it was not found in your environment. You can install it with pip: +`pip install timm` +""" # docstyle-ignore VISION_IMPORT_ERROR = """ @@ -562,9 +588,11 @@ BACKENDS_MAPPING = OrderedDict( ("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)), ("speech", (is_speech_available, SPEECH_IMPORT_ERROR)), ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), + ("timm", (is_timm_available, TIMM_IMPORT_ERROR)), ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)), ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), ("vision", (is_vision_available, VISION_IMPORT_ERROR)), + ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), ] ) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 76075014535..f4e5c09f568 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -36,6 +36,7 @@ from . import ( ctrl, deberta, deit, + detr, dialogpt, distilbert, dpr, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index deb976d3415..a620c0a75dd 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -35,6 +35,7 @@ if is_torch_available(): "MODEL_FOR_MASKED_LM_MAPPING", "MODEL_FOR_MULTIPLE_CHOICE_MAPPING", "MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", + "MODEL_FOR_OBJECT_DETECTION_MAPPING", "MODEL_FOR_PRETRAINING_MAPPING", "MODEL_FOR_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", @@ -119,6 +120,7 @@ if TYPE_CHECKING: MODEL_FOR_MASKED_LM_MAPPING, MODEL_FOR_MULTIPLE_CHOICE_MAPPING, MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + MODEL_FOR_OBJECT_DETECTION_MAPPING, MODEL_FOR_PRETRAINING_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index e95d7cac12f..c103622d698 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -39,6 +39,7 @@ from ..ctrl.configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLCo from ..deberta.configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig from ..deberta_v2.configuration_deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config from ..deit.configuration_deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig +from ..detr.configuration_detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig from ..distilbert.configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig from ..dpr.configuration_dpr import DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig from ..electra.configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig @@ -99,6 +100,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict( BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, + DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP, MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -156,6 +158,7 @@ CONFIG_MAPPING = OrderedDict( ("bigbird_pegasus", BigBirdPegasusConfig), ("deit", DeiTConfig), ("luke", LukeConfig), + ("detr", DetrConfig), ("gpt_neo", GPTNeoConfig), ("big_bird", BigBirdConfig), ("speech_to_text", Speech2TextConfig), @@ -219,6 +222,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("bigbird_pegasus", "BigBirdPegasus"), ("deit", "DeiT"), ("luke", "LUKE"), + ("detr", "DETR"), ("gpt_neo", "GPT Neo"), ("big_bird", "BigBird"), ("speech_to_text", "Speech2Text"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 3cf3062f433..8b144b83c71 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -106,6 +106,7 @@ from ..deberta_v2.modeling_deberta_v2 import ( DebertaV2Model, ) from ..deit.modeling_deit import DeiTForImageClassification, DeiTForImageClassificationWithTeacher, DeiTModel +from ..detr.modeling_detr import DetrForObjectDetection, DetrModel from ..distilbert.modeling_distilbert import ( DistilBertForMaskedLM, DistilBertForMultipleChoice, @@ -316,6 +317,7 @@ from .configuration_auto import ( DebertaConfig, DebertaV2Config, DeiTConfig, + DetrConfig, DistilBertConfig, DPRConfig, ElectraConfig, @@ -372,6 +374,7 @@ MODEL_MAPPING = OrderedDict( (BigBirdPegasusConfig, BigBirdPegasusModel), (DeiTConfig, DeiTModel), (LukeConfig, LukeModel), + (DetrConfig, DetrModel), (GPTNeoConfig, GPTNeoModel), (BigBirdConfig, BigBirdModel), (Speech2TextConfig, Speech2TextModel), @@ -586,6 +589,13 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict( ] ) +MODEL_FOR_OBJECT_DETECTION_MAPPING = OrderedDict( + [ + # Model for Object Detection mapping + (DetrConfig, DetrForObjectDetection), + ] +) + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict( [ # Model for Seq2Seq Causal LM mapping diff --git a/src/transformers/models/detr/__init__.py b/src/transformers/models/detr/__init__.py new file mode 100644 index 00000000000..b0dd3e2c674 --- /dev/null +++ b/src/transformers/models/detr/__init__.py @@ -0,0 +1,72 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...file_utils import _BaseLazyModule, is_timm_available, is_vision_available + + +_import_structure = { + "configuration_detr": ["DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetrConfig"], +} + +if is_vision_available(): + _import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"] + +if is_timm_available(): + _import_structure["modeling_detr"] = [ + "DETR_PRETRAINED_MODEL_ARCHIVE_LIST", + "DetrForObjectDetection", + "DetrForSegmentation", + "DetrModel", + "DetrPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig + + if is_vision_available(): + from .feature_extraction_detr import DetrFeatureExtractor + + if is_timm_available(): + from .modeling_detr import ( + DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + DetrForObjectDetection, + DetrForSegmentation, + DetrModel, + DetrPreTrainedModel, + ) + +else: + import importlib + import os + import sys + + class _LazyModule(_BaseLazyModule): + """ + Module class that surfaces all objects but only performs associated imports when the objects are requested. + """ + + __file__ = globals()["__file__"] + __path__ = [os.path.dirname(__file__)] + + def _get_module(self, module_name: str): + return importlib.import_module("." + module_name, self.__name__) + + sys.modules[__name__] = _LazyModule(__name__, _import_structure) diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py new file mode 100644 index 00000000000..52625b1494c --- /dev/null +++ b/src/transformers/models/detr/configuration_detr.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2021 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" DETR model configuration """ + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/detr-resnet-50": "https://huggingface.co/facebook/detr-resnet-50/resolve/main/config.json", + # See all DETR models at https://huggingface.co/models?filter=detr +} + + +class DetrConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.DetrModel`. It is used to + instantiate a DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the DETR `facebook/detr-resnet-50 + `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + num_queries (:obj:`int`, `optional`, defaults to 100): + Number of object queries, i.e. detection slots. This is the maximal number of objects + :class:`~transformers.DetrModel` can detect in a single image. For COCO, we recommend 100 queries. + d_model (:obj:`int`, `optional`, defaults to 256): + Dimension of the layers. + encoder_layers (:obj:`int`, `optional`, defaults to 6): + Number of encoder layers. + decoder_layers (:obj:`int`, `optional`, defaults to 6): + Number of decoder layers. + encoder_attention_heads (:obj:`int`, `optional`, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (:obj:`int`, `optional`, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + dropout (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (:obj:`float`, `optional`, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. + decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + auxiliary_loss (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"sine"`): + Type of position embeddings to be used on top of the image features. One of :obj:`"sine"` or + :obj:`"learned"`. + backbone (:obj:`str`, `optional`, defaults to :obj:`"resnet50"`): + Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a + list of all available models, see `this page + `__. + dilation (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to replace stride with dilation in the last convolutional block (DC5). + class_cost (:obj:`float`, `optional`, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (:obj:`float`, `optional`, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (:obj:`float`, `optional`, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (:obj:`float`, `optional`, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (:obj:`float`, `optional`, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (:obj:`float`, `optional`, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (:obj:`float`, `optional`, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (:obj:`float`, `optional`, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + + Examples:: + + >>> from transformers import DetrModel, DetrConfig + + >>> # Initializing a DETR facebook/detr-resnet-50 style configuration + >>> configuration = DetrConfig() + + >>> # Initializing a model from the facebook/detr-resnet-50 style configuration + >>> model = DetrModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "detr" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + num_queries=100, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + classifier_dropout=0.0, + scale_embedding=False, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + dilation=False, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + **kwargs + ): + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.dilation = dilation + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 00000000000..66165809759 --- /dev/null +++ b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,273 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DETR checkpoints.""" + + +import argparse +from collections import OrderedDict +from pathlib import Path + +import torch +from PIL import Image + +import requests +from transformers import DetrConfig, DetrFeatureExtractor, DetrForObjectDetection, DetrForSegmentation +from transformers.utils import logging +from transformers.utils.coco_classes import id2label + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +# here we list all keys to be renamed (original name on the left, our name on the right) +rename_keys = [] +for i in range(6): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) + # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", + f"decoder.layers.{i}.encoder_attn.out_proj.weight", + ) + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", + f"decoder.layers.{i}.encoder_attn.out_proj.bias", + ) + ) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) + +# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads +rename_keys.extend( + [ + ("input_proj.weight", "input_projection.weight"), + ("input_proj.bias", "input_projection.bias"), + ("query_embed.weight", "query_position_embeddings.weight"), + ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), + ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), + ("class_embed.weight", "class_labels_classifier.weight"), + ("class_embed.bias", "class_labels_classifier.bias"), + ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), + ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), + ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), + ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), + ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), + ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), + ] +) + + +def rename_key(state_dict, old, new): + val = state_dict.pop(old) + state_dict[new] = val + + +def rename_backbone_keys(state_dict): + new_state_dict = OrderedDict() + for key, value in state_dict.items(): + if "backbone.0.body" in key: + new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") + new_state_dict[new_key] = value + else: + new_state_dict[key] = value + + return new_state_dict + + +def read_in_q_k_v(state_dict, is_panoptic=False): + prefix = "" + if is_panoptic: + prefix = "detr." + + # first: transformer encoder + for i in range(6): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + # next: transformer decoder (which is a bit more complex because it also includes cross-attention) + for i in range(6): + # read in weights + bias of input projection layer of self-attention + in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + # read in weights + bias of input projection layer of cross-attention + in_proj_weight_cross_attn = state_dict.pop( + f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" + ) + in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") + # next, add query, keys and values (in that order) of cross-attention to the state dict + state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] + state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] + state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] + state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] + state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] + state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_detr_checkpoint(model_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our DETR structure. + """ + + # load default config + config = DetrConfig() + # set backbone and dilation attributes + if "resnet101" in model_name: + config.backbone = "resnet101" + if "dc5" in model_name: + config.dilation = True + is_panoptic = "panoptic" in model_name + if is_panoptic: + config.num_labels = 250 + else: + config.num_labels = 91 + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load feature extractor + format = "coco_panoptic" if is_panoptic else "coco_detection" + feature_extractor = DetrFeatureExtractor(format=format) + + # prepare image + img = prepare_img() + encoding = feature_extractor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info(f"Converting model {model_name}...") + + # load original model from torch hub + detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval() + state_dict = detr.state_dict() + # rename keys + for src, dest in rename_keys: + if is_panoptic: + src = "detr." + src + rename_key(state_dict, src, dest) + state_dict = rename_backbone_keys(state_dict) + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, is_panoptic=is_panoptic) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "detr.model." if is_panoptic else "model." + for key in state_dict.copy().keys(): + if is_panoptic: + if ( + key.startswith("detr") + and not key.startswith("class_labels_classifier") + and not key.startswith("bbox_predictor") + ): + val = state_dict.pop(key) + state_dict["detr.model" + key[4:]] = val + elif "class_labels_classifier" in key or "bbox_predictor" in key: + val = state_dict.pop(key) + state_dict["detr." + key] = val + elif key.startswith("bbox_attention") or key.startswith("mask_head"): + continue + else: + val = state_dict.pop(key) + state_dict[prefix + key] = val + else: + if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + # verify our conversion + original_outputs = detr(pixel_values) + outputs = model(pixel_values) + assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) + assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) + if is_panoptic: + assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) + + # Save model and feature extractor + logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name", default="detr_resnet50", type=str, help="Name of the DETR model you'd like to convert." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + args = parser.parse_args() + convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py new file mode 100644 index 00000000000..7c9b5526dc8 --- /dev/null +++ b/src/transformers/models/detr/feature_extraction_detr.py @@ -0,0 +1,890 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Dict, List, Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...file_utils import TensorType, is_torch_available +from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor +from ...utils import logging + + +if is_torch_available(): + import torch + import torch.nn.functional as F + +logger = logging.get_logger(__name__) + + +ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] + + +# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +def center_to_corners_format(x): + """ + Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format + (x_0, y_0, x_1, y_1). + """ + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +def corners_to_center_format(x): + """ + Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1, + y_1) to center format (center_x, center_y, width, height). + """ + x_transposed = x.T + x0, y0, x1, y1 = x_transposed[0], x_transposed[1], x_transposed[2], x_transposed[3] + b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] + return np.stack(b, axis=-1) + + +def masks_to_boxes(masks): + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + + Returns a [N, 4] tensor, with the boxes in corner (xyxy) format. + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. +def rgb_to_id(color): + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def id_to_rgb(id_map): + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a DETR feature extractor. + + This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + + Args: + format (:obj:`str`, `optional`, defaults to :obj:`"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to resize the input to a certain :obj:`size`. + size (:obj:`int`, `optional`, defaults to 800): + Resize the input to the given size. Only has an effect if :obj:`do_resize` is set to :obj:`True`. If size + is a sequence like :obj:`(width, height)`, output size will be matched to this. If size is an int, smaller + edge of the image will be matched to this number. i.e, if :obj:`height > width`, then image will be + rescaled to :obj:`(size * height / width, size)`. + max_size (:obj:`int`, `optional`, defaults to :obj:`1333`): + The largest size an image dimension can have (otherwise it's capped). Only has an effect if + :obj:`do_resize` is set to :obj:`True`. + do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (:obj:`int`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]s`): + The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean. + image_std (:obj:`int`, `optional`, defaults to :obj:`[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the + ImageNet std. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + format="coco_detection", + do_resize=True, + size=800, + max_size=1333, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.format = self._is_valid_format(format) + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean + self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std + + def _is_valid_format(self, format): + if format not in ["coco_detection", "coco_panoptic"]: + raise ValueError(f"Format {format} not supported") + return format + + def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): + if self.format == "coco_detection": + image, target = self.prepare_coco_detection(image, target, return_segmentation_masks) + return image, target + elif self.format == "coco_panoptic": + image, target = self.prepare_coco_panoptic(image, target, masks_path) + return image, target + else: + raise ValueError(f"Format {self.format} not supported") + + # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 + def convert_coco_poly_to_mask(self, segmentations, height, width): + + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 + def prepare_coco_detection(self, image, target, return_segmentation_masks=False): + """ + Convert the target in COCO format into the format expected by DETR. + """ + w, h = image.size + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # get all COCO annotations for the given image + anno = target["annotations"] + + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=w) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = np.asarray(classes, dtype=np.int64) + + if return_segmentation_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = self.convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.reshape((-1, 3)) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if return_segmentation_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["class_labels"] = classes + if return_segmentation_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in anno], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno], dtype=np.int64) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + + return image, target + + def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): + w, h = image.size + ann_info = target.copy() + ann_path = pathlib.Path(masks_path) / ann_info["file_name"] + + if "segments_info" in ann_info: + masks = np.asarray(Image.open(ann_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([ann["id"] for ann in ann_info["segments_info"]]) + masks = masks == ids[:, None, None] + masks = np.asarray(masks, dtype=np.uint8) + + labels = np.asarray([ann["category_id"] for ann in ann_info["segments_info"]], dtype=np.int64) + + target = {} + target["image_id"] = np.asarray( + [ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]], dtype=np.int64 + ) + if return_masks: + target["masks"] = masks + target["class_labels"] = labels + + target["boxes"] = masks_to_boxes(masks) + + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + if "segments_info" in ann_info: + target["iscrowd"] = np.asarray([ann["iscrowd"] for ann in ann_info["segments_info"]], dtype=np.int64) + target["area"] = np.asarray([ann["area"] for ann in ann_info["segments_info"]], dtype=np.float32) + + return image, target + + def _resize(self, image, size, target=None, max_size=None): + """ + Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller + edge of the image will be matched to this number. + + If given, also resize the target accordingly. + """ + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size + else: + # size returned must be (w, h) since we use PIL to resize images + # so we revert the tuple + return get_size_with_aspect_ratio(image_size, size, max_size)[::-1] + + size = get_size(image.size, size, max_size) + rescaled_image = self.resize(image, size=size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + w, h = size + target["size"] = np.asarray([h, w], dtype=np.int64) + + if "masks" in target: + # use PyTorch as current workaround + # TODO replace by self.resize + masks = torch.from_numpy(target["masks"][:, None]).float() + interpolated_masks = F.interpolate(masks, size=(h, w), mode="nearest")[:, 0] > 0.5 + target["masks"] = interpolated_masks.numpy() + + return rescaled_image, target + + def _normalize(self, image, mean, std, target=None): + """ + Normalize the image with a certain mean and std. + + If given, also normalize the target bounding boxes based on the size of the image. + """ + + image = self.normalize(image, mean=mean, std=std) + if target is None: + return image, None + + target = target.copy() + h, w = image.shape[-2:] + + if "boxes" in target: + boxes = target["boxes"] + boxes = corners_to_center_format(boxes) + boxes = boxes / np.asarray([w, h, w, h], dtype=np.float32) + target["boxes"] = boxes + + return image, target + + def __call__( + self, + images: ImageInput, + annotations: Union[List[Dict], List[List[Dict]]] = None, + return_segmentation_masks: Optional[bool] = False, + masks_path: Optional[pathlib.Path] = None, + pad_and_return_pixel_mask: Optional[bool] = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s) and optional annotations. Images are by default + padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are + real/which are padding. + + .. warning:: + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + Args: + images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + annotations (:obj:`Dict`, :obj:`List[Dict]`, `optional`): + The corresponding annotations in COCO format. + + In case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format = + "coco_detection"`, the annotations for each image should have the following format: {'image_id': int, + 'annotations': [annotation]}, with the annotations being a list of COCO object annotations. + + In case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format = + "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int, + 'file_name': str, 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic + annotations. + + return_segmentation_masks (:obj:`Dict`, :obj:`List[Dict]`, `optional`, defaults to :obj:`False`): + Whether to also return instance segmentation masks in case :obj:`format = "coco_detection"`. + + masks_path (:obj:`pathlib.Path`, `optional`): + Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only + relevant in case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format = + "coco_panoptic"`. + + pad_and_return_pixel_mask (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to pad images up to the largest image in a batch and create a pixel mask. + + If left to the default, will return a pixel mask that is: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of NumPy arrays. If set to :obj:`'pt'`, return PyTorch + :obj:`torch.Tensor` objects. + + Returns: + :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if + `"pixel_mask"` is in :obj:`self.model_input_names`). + """ + # Input type checking for clearer error + + valid_images = False + valid_annotations = False + valid_masks_path = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example)," + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." + ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + # Check that annotations has a valid type + if annotations is not None: + if not is_batched: + if self.format == "coco_detection": + if isinstance(annotations, dict) and "image_id" in annotations and "annotations" in annotations: + if isinstance(annotations["annotations"], (list, tuple)): + # an image can have no annotations + if len(annotations["annotations"]) == 0 or isinstance(annotations["annotations"][0], dict): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations, dict) and "image_id" in annotations and "segments_info" in annotations: + if isinstance(annotations["segments_info"], (list, tuple)): + # an image can have no segments (?) + if len(annotations["segments_info"]) == 0 or isinstance( + annotations["segments_info"][0], dict + ): + valid_annotations = True + else: + if isinstance(annotations, (list, tuple)): + assert len(images) == len(annotations), "There must be as many annotations as there are images" + if isinstance(annotations[0], Dict): + if self.format == "coco_detection": + if isinstance(annotations[0]["annotations"], (list, tuple)): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations[0]["segments_info"], (list, tuple)): + valid_annotations = True + + if not valid_annotations: + raise ValueError( + """ + Annotations must of type `Dict` (single image) or `List[Dict]` (batch of images). In case of object + detection, each dictionary should contain the keys 'image_id' and 'annotations', with the latter + being a list of annotations in COCO format. In case of panoptic segmentation, each dictionary + should contain the keys 'file_name', 'image_id' and 'segments_info', with the latter being a list + of annotations in COCO format. + """ + ) + + # Check that masks_path has a valid type + if masks_path is not None: + if self.format == "coco_panoptic": + if isinstance(masks_path, pathlib.Path): + valid_masks_path = True + if not valid_masks_path: + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a `pathlib.Path` object." + ) + + if not is_batched: + images = [images] + if annotations is not None: + annotations = [annotations] + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + image, target = self.prepare(image, target, return_segmentation_masks, masks_path) + images[idx] = image + annotations[idx] = target + + # transformations (resizing + normalization) + if self.do_resize and self.size is not None: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._resize(image=image, target=target, size=self.size, max_size=self.max_size) + images[idx] = image + annotations[idx] = target + else: + for idx, image in enumerate(images): + images[idx] = self._resize(image=image, target=None, size=self.size, max_size=self.max_size)[0] + + if self.do_normalize: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._normalize( + image=image, mean=self.image_mean, std=self.image_std, target=target + ) + images[idx] = image + annotations[idx] = target + else: + images = [ + self._normalize(image=image, mean=self.image_mean, std=self.image_std)[0] for image in images + ] + + if pad_and_return_pixel_mask: + # pad images up to largest image in batch and create pixel_mask + max_size = self._max_by_axis([list(image.shape) for image in images]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in images: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + images = padded_images + + # return as BatchFeature + data = {} + data["pixel_values"] = images + if pad_and_return_pixel_mask: + data["pixel_mask"] = pixel_mask + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + # Convert to TensorType + tensor_type = return_tensors + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + if not tensor_type == TensorType.PYTORCH: + raise ValueError("Only PyTorch is supported for the moment.") + else: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + + encoded_inputs["target"] = [ + {k: torch.from_numpy(v) for k, v in target.items()} for target in annotations + ] + + return encoded_inputs + + def _max_by_axis(self, the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + def pad_and_create_pixel_mask( + self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None + ): + """ + Pad images up to the largest image in a batch and create a corresponding :obj:`pixel_mask`. + + Args: + pixel_values_list (:obj:`List[torch.Tensor]`): + List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W). + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of NumPy arrays. If set to :obj:`'pt'`, return PyTorch + :obj:`torch.Tensor` objects. + + Returns: + :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if + `"pixel_mask"` is in :obj:`self.model_input_names`). + + """ + + max_size = self._max_by_axis([list(image.shape) for image in pixel_values_list]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in pixel_values_list: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + + # return as BatchFeature + data = {"pixel_values": padded_images, "pixel_mask": pixel_mask} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs + + # POSTPROCESSING METHODS + # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258 + def post_process(self, outputs, target_sizes): + """ + Converts the output of :class:`~transformers.DetrForObjectDetection` into the format expected by the COCO api. + Only supports PyTorch. + + Args: + outputs (:class:`~transformers.DetrObjectDetectionOutput`): + Raw outputs of the model. + target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`, `optional`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). For visualization, this should be the image size after data + augment, but before padding. + + Returns: + :obj:`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an + image in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + assert len(out_logits) == len( + target_sizes + ), "Make sure that you pass in as many target sizes as the batch dimension of the logits" + assert ( + target_sizes.shape[1] == 2 + ), "Each element of target_sizes must contain the size (h, w) of each image of the batch" + + prob = F.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(out_bbox) + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + def post_process_segmentation(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): + """ + Converts the output of :class:`~transformers.DetrForSegmentation` into actual instance segmentation + predictions. Only supports PyTorch. + + Args: + results (:obj:`List[Dict]`): + Results list obtained by :meth:`~transformers.DetrFeatureExtractor.post_process`, to which "masks" + results will be added. + outputs (:class:`~transformers.DetrSegmentationOutput`): + Raw outputs of the model. + orig_target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). + max_target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`): + Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). + threshold (:obj:`float`, `optional`, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + :obj:`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks + for an image in the batch as predicted by the model. + """ + + assert len(orig_target_sizes) == len( + max_target_sizes + ), "Make sure to pass in as many orig_target_sizes as max_target_sizes" + max_h, max_w = max_target_sizes.max(0)[0].tolist() + outputs_masks = outputs.pred_masks.squeeze(2) + outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False) + outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() + + for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + img_h, img_w = t[0], t[1] + results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) + results[i]["masks"] = F.interpolate( + results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" + ).byte() + + return results + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 + def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): + """ + Converts the output of :class:`~transformers.DetrForSegmentation` into actual panoptic predictions. Only + supports PyTorch. + + Parameters: + outputs (:class:`~transformers.DetrSegmentationOutput`): + Raw outputs of the model. + processed_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)` or :obj:`List[Tuple]` of length :obj:`batch_size`): + Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data + augmentation but before batching. + target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)` or :obj:`List[Tuple]` of length :obj:`batch_size`, `optional`): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to + None, it will default to the :obj:`processed_sizes`. + is_thing_map (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`, `optional`): + Dictionary mapping class indices to either True or False, depending on whether or not they are a thing. + If not set, defaults to the :obj:`is_thing_map` of COCO panoptic. + threshold (:obj:`float`, `optional`, defaults to 0.85): + Threshold to use to filter out queries. + + Returns: + :obj:`List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values + for an image in the batch as predicted by the model. + """ + if target_sizes is None: + target_sizes = processed_sizes + assert len(processed_sizes) == len( + target_sizes + ), "Make sure to pass in as many processed_sizes as target_sizes" + + if is_thing_map is None: + # default to is_thing_map of COCO panoptic + is_thing_map = {i: i <= 90 for i in range(201)} + + out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes + assert ( + len(out_logits) == len(raw_masks) == len(target_sizes) + ), "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = F.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = center_to_corners_format(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + assert len(cur_boxes) == len(cur_classes), "Not as many boxes as there are classes" + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_classes): + if not is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) + + np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) + np_seg_img = np_seg_img.view(final_h, final_w, 3) + np_seg_img = np_seg_img.numpy() + + m_id = torch.from_numpy(rgb_to_id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_classes.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_classes[i].item() + segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) + del cur_classes + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py new file mode 100644 index 00000000000..b876e1be8fe --- /dev/null +++ b/src/transformers/models/detr/modeling_detr.py @@ -0,0 +1,2267 @@ +# coding=utf-8 +# Copyright 2021 Facebook AI Research The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch DETR model. """ + + +import math +import random +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + is_vision_available, + replace_return_docstrings, + requires_backends, +) +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import logging +from .configuration_detr import DetrConfig + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_vision_available(): + from .feature_extraction_detr import center_to_corners_format + +if is_timm_available(): + from timm import create_model + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DetrConfig" + +DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/detr-resnet-50", + # See all DETR models at https://huggingface.co/models?filter=detr +] + + +@dataclass +class DetrDecoderOutput(BaseModelOutputWithCrossAttentions): + """ + Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of + each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the cross-attention heads. + intermediate_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(config.decoder_layers, batch_size, num_queries, hidden_size)`, `optional`, returned when ``config.auxiliary_loss=True``): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +class DetrModelOutput(Seq2SeqModelOutput): + """ + Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. If + :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, + hidden_size)` is output. + decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of + each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to + compute the weighted average in the self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of + each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to + compute the weighted average in the self-attention heads. + intermediate_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(config.decoder_layers, batch_size, sequence_length, hidden_size)`, `optional`, returned when ``config.auxiliary_loss=True``): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +class DetrObjectDetectionOutput(ModelOutput): + """ + Output type of :class:`~transformers.DetrForObjectDetection`. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (:obj:`Dict`, `optional`): + A dictionary containing the individual losses. Useful for logging. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use :class:`~transformers.DetrForObjectDetection.post_process` to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (:obj:`list[Dict]`, `optional`): + Optional, only returned when auxilary losses are activated (i.e. :obj:`config.auxiliary_loss` is set to + `True`) and labels are provided. It is a list of dictionnaries containing the two above keys (:obj:`logits` + and :obj:`pred_boxes`) for each decoder layer. + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, + 1, hidden_size)` is output. + decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of + each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to + compute the weighted average in the self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of + each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to + compute the weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class DetrSegmentationOutput(ModelOutput): + """ + Output type of :class:`~transformers.DetrForSegmentation`. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (:obj:`Dict`, `optional`): + A dictionary containing the individual losses. Useful for logging. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use :meth:`~transformers.DetrForObjectDetection.post_process` to retrieve the + unnormalized bounding boxes. + pred_masks (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, width, height)`): + Segmentation masks for all queries. See also + :meth:`~transformers.DetrFeatureExtractor.post_process_segmentation` or + :meth:`~transformers.DetrFeatureExtractor.post_process_panoptic` to evaluate instance and panoptic + segmentation masks respectively. + auxiliary_outputs (:obj:`list[Dict]`, `optional`): + Optional, only returned when auxilary losses are activated (i.e. :obj:`config.auxiliary_loss` is set to + `True`) and labels are provided. It is a list of dictionnaries containing the two above keys (:obj:`logits` + and :obj:`pred_boxes`) for each decoder layer. + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, + 1, hidden_size)` is output. + decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of + each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to + compute the weighted average in the self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of + each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to + compute the weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + pred_masks: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +# BELOW: utilities copied from +# https://github.com/facebookresearch/detr/blob/master/backbone.py +class DetrFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super(DetrFrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(DetrFrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +def replace_batch_norm(m, name=""): + for attr_str in dir(m): + target_attr = getattr(m, attr_str) + if isinstance(target_attr, torch.nn.BatchNorm2d): + frozen = DetrFrozenBatchNorm2d(target_attr.num_features) + bn = getattr(m, attr_str) + frozen.weight.data.copy_(bn.weight) + frozen.bias.data.copy_(bn.bias) + frozen.running_mean.data.copy_(bn.running_mean) + frozen.running_var.data.copy_(bn.running_var) + setattr(m, attr_str, frozen) + for n, ch in m.named_children(): + replace_batch_norm(ch, n) + + +class DetrTimmConvEncoder(nn.Module): + """ + Convolutional encoder (backbone) from the timm library. + + nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, name: str, dilation: bool): + super().__init__() + + kwargs = {} + if dilation: + kwargs["output_stride"] = 16 + + requires_backends(self, ["timm"]) + + backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs) + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = self.model.feature_info.channels() + + if "resnet" in name: + for name, parameter in self.model.named_parameters(): + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = F.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +class DetrConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +class DetrSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + assert pixel_mask is not None, "No pixel mask provided" + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class DetrLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + h, w = pixel_values.shape[-2:] + i = torch.arange(w, device=pixel_values.device) + j = torch.arange(h, device=pixel_values.device) + x_emb = self.column_embeddings(i) + y_emb = self.row_embeddings(j) + pos = torch.cat([x_emb.unsqueeze(0).repeat(h, 1, 1), y_emb.unsqueeze(1).repeat(1, w, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = DetrSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = DetrLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +class DetrAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + self.scaling = self.head_dim ** -0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + key_value_states: Optional[torch.Tensor] = None, + key_value_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # add key-value position embeddings to the key value states + if key_value_position_embeddings is not None: + key_value_states_original = key_value_states + key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states_original), -1, bsz) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states_original), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = F.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class DetrEncoderLayer(nn.Module): + def __init__(self, config: DetrConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_embeddings (:obj:`torch.FloatTensor`, `optional`): position embeddings, to be added to hidden_states. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class DetrDecoderLayer(nn.Module): + def __init__(self, config: DetrConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = DetrAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_embeddings (:obj:`torch.FloatTensor`, `optional`): position embeddings that are added to the queries and keys + in the cross-attention layer. + query_position_embeddings (:obj:`torch.FloatTensor`, `optional`): position embeddings that are added to the queries and keys + in the self-attention layer. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape :obj:`(seq_len, batch, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=query_position_embeddings, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + position_embeddings=query_position_embeddings, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + key_value_position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +class DetrClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class DetrPreTrainedModel(PreTrainedModel): + config_class = DetrConfig + base_model_prefix = "model" + + def _init_weights(self, module): + std = self.config.init_std + xavier_std = self.config.init_xavier_std + + if isinstance(module, DetrMHAttentionMap): + nn.init.zeros_(module.k_linear.bias) + nn.init.zeros_(module.q_linear.bias) + nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) + nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) + elif isinstance(module, DetrLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +DETR_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.DetrConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +DETR_INPUTS_DOCSTRING = r""" + Args: + pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using :class:`~transformers.DetrTokenizer`. See + :meth:`transformers.DetrTokenizer.__call__` for details. + + pixel_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, height, width)`, `optional`): + Mask to avoid performing attention on padding pixel values. Mask values selected in ``[0, 1]``: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + `What are attention masks? <../glossary.html#attention-mask>`__ + + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_queries)`, `optional`): + Not used by default. Can be used to mask object queries. + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, hidden_size)`, `optional`): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +class DetrEncoder(DetrPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + :class:`DetrEncoderLayer`. + + The encoder updates the flattened feature map through multiple self-attention layers. + + Small tweak for DETR: + + - position_embeddings are added to the forward pass. + + Args: + config: DetrConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: DetrConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # in the original DETR, no layernorm is used for the Encoder, as "normalize_before" is set to False by default there + + self.init_weights() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding pixel features. Mask values selected in ``[0, 1]``: + + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + + `What are attention masks? <../glossary.html#attention-mask>`__ + + position_embeddings (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + # we add position_embeddings as extra input to the encoder_layer + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class DetrDecoder(DetrPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`DetrDecoderLayer`. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some small tweaks for DETR: + + - position_embeddings and query_position_embeddings are added to the forward pass. + - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. + + Args: + config: DetrConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: DetrConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)]) + # in DETR, the decoder uses layernorm after the last decoder layer output + self.layernorm = nn.LayerNorm(config.d_model) + + self.init_weights() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + query_position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + The query embeddings that are passed into the decoder. + + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on certain queries. Mask values selected in ``[0, 1]``: + + - 1 for queries that are **not masked**, + - 0 for queries that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in ``[0, 1]``: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + position_embeddings (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Position embeddings that are added to the queries and keys in each cross-attention layer. + query_position_embeddings (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, hidden_size)`):, `optional`): + Position embeddings that are added to the queries and keys in each self-attention layer. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + input_shape = inputs_embeds.size()[:-1] + + combined_attention_mask = None + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # optional intermediate hidden states + intermediate = () if self.config.auxiliary_loss else None + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if self.config.auxiliary_loss: + hidden_states = self.layernorm(hidden_states) + intermediate += (hidden_states,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # finally, apply layernorm + hidden_states = self.layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # stack intermediate decoder activations + if self.config.auxiliary_loss: + intermediate = torch.stack(intermediate) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate] + if v is not None + ) + return DetrDecoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + intermediate_hidden_states=intermediate, + ) + + +@add_start_docstrings( + """ + The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without + any specific head on top. + """, + DETR_START_DOCSTRING, +) +class DetrModel(DetrPreTrainedModel): + def __init__(self, config: DetrConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = DetrTimmConvEncoder(config.backbone, config.dilation) + position_embeddings = build_position_encoding(config) + self.backbone = DetrConvModel(backbone, position_embeddings) + + # Create projection layer + self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) + + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = DetrEncoder(config) + self.decoder = DetrDecoder(config) + + self.init_weights() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DetrModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples:: + + >>> from transformers import DetrFeatureExtractor, DetrModel + >>> from PIL import Image + >>> import requests + + >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50') + >>> model = DetrModel.from_pretrained('facebook/detr-resnet-50') + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), device=device) + + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # pixel_values should be of shape (batch_size, num_channels, height, width) + # pixel_mask should be of shape (batch_size, height, width) + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # get final feature map and downsampled mask + feature_map, mask = features[-1] + + assert mask is not None, "Backbone does not return downsampled pixel mask" + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + projected_feature_map = self.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + inputs_embeds=queries, + attention_mask=None, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return DetrModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + ) + + +@add_start_docstrings( + """ + DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks + such as COCO detection. + """, + DETR_START_DOCSTRING, +) +class DetrForObjectDetection(DetrPreTrainedModel): + def __init__(self, config: DetrConfig): + super().__init__(config) + + # DETR encoder-decoder model + self.model = DetrModel(config) + + # Object detection heads + self.class_labels_classifier = nn.Linear( + config.d_model, config.num_labels + 1 + ) # We add one for the "no object" class + self.bbox_predictor = DetrMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + self.init_weights() + + # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`List[Dict]` of len :obj:`(batch_size,)`, `optional`): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing 2 keys: + 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch respectively). The + class labels themselves should be a :obj:`torch.LongTensor` of len :obj:`(number of bounding boxes in the + image,)` and the boxes a :obj:`torch.FloatTensor` of shape :obj:`(number of bounding boxes in the image, + 4)`. + + Returns: + + Examples:: + + >>> from transformers import DetrFeatureExtractor, DetrForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50') + >>> model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50') + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # model predicts bounding boxes and corresponding COCO classes + >>> logits = outputs.logits + >>> bboxes = outputs.pred_boxes + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # class logits + predicted bounding boxes + logits = self.class_labels_classifier(sequence_output) + pred_boxes = self.bbox_predictor(sequence_output).sigmoid() + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = DetrHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = DetrLoss( + matcher=matcher, + num_classes=self.config.num_labels, + eos_coef=self.config.eos_coefficient, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] + outputs_class = self.class_labels_classifier(intermediate) + outputs_coord = self.bbox_predictor(intermediate).sigmoid() + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return DetrObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks + such as COCO panoptic. + + """, + DETR_START_DOCSTRING, +) +class DetrForSegmentation(DetrPreTrainedModel): + def __init__(self, config: DetrConfig): + super().__init__(config) + + # object detection model + self.detr = DetrForObjectDetection(config) + + # segmentation head + hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads + intermediate_channel_sizes = self.detr.model.backbone.conv_encoder.intermediate_channel_sizes + + self.mask_head = DetrMaskHeadSmallConv( + hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size + ) + + self.bbox_attention = DetrMHAttentionMap( + hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std + ) + + self.init_weights() + + @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DetrSegmentationOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`List[Dict]` of len :obj:`(batch_size,)`, `optional`): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing 3 keys: + 'class_labels', 'boxes' and 'masks' (the class labels, bounding boxes and segmentation masks of an image in + the batch respectively). The class labels themselves should be a :obj:`torch.LongTensor` of len + :obj:`(number of bounding boxes in the image,)`, the boxes a :obj:`torch.FloatTensor` of shape + :obj:`(number of bounding boxes in the image, 4)` and the masks a :obj:`torch.FloatTensor` of shape + :obj:`(number of bounding boxes in the image, 4)`. + + Returns: + + Examples:: + + >>> from transformers import DetrFeatureExtractor, DetrForSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50-panoptic') + >>> model = DetrForSegmentation.from_pretrained('facebook/detr-resnet-50-panoptic') + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # model predicts COCO classes, bounding boxes, and masks + >>> logits = outputs.logits + >>> bboxes = outputs.pred_boxes + >>> masks = outputs.pred_masks + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones((batch_size, height, width), device=device) + + # First, get list of feature maps and position embeddings + features, position_embeddings_list = self.detr.model.backbone(pixel_values, pixel_mask=pixel_mask) + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + feature_map, mask = features[-1] + batch_size, num_channels, height, width = feature_map.shape + projected_feature_map = self.detr.model.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.detr.model.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.detr.model.query_position_embeddings.weight.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.detr.model.decoder( + inputs_embeds=queries, + attention_mask=None, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Sixth, compute logits, pred_boxes and pred_masks + logits = self.detr.class_labels_classifier(sequence_output) + pred_boxes = self.detr.bbox_predictor(sequence_output).sigmoid() + + memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width) + mask = flattened_mask.view(batch_size, height, width) + + # FIXME h_boxes takes the last one computed, keep this in mind + # important: we need to reverse the mask, since in the original implementation the mask works reversed + # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32) + bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask) + + seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) + + pred_masks = seg_masks.view(batch_size, self.detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = DetrHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality", "masks"] + criterion = DetrLoss( + matcher=matcher, + num_classes=self.config.num_labels, + eos_coef=self.config.eos_coefficient, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + outputs_loss["pred_masks"] = pred_masks + if self.config.auxiliary_loss: + intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1] + outputs_class = self.class_labels_classifier(intermediate) + outputs_coord = self.bbox_predictor(intermediate).sigmoid() + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + weight_dict["loss_mask"] = self.config.mask_loss_coefficient + weight_dict["loss_dice"] = self.config.dice_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs + else: + output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return DetrSegmentationOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + pred_masks=pred_masks, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def _expand(tensor, length: int): + return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py +class DetrMaskHeadSmallConv(nn.Module): + """ + Simple convolutional head, using group norm. Upsampling is done using a FPN approach + """ + + def __init__(self, dim, fpn_dims, context_dim): + super().__init__() + + assert ( + dim % 8 == 0 + ), "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8" + + inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + + self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1) + self.gn1 = torch.nn.GroupNorm(8, dim) + self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1) + self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) + self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) + self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) + self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) + self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1) + + self.dim = dim + + self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): + # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with + # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32). + # We expand the projected feature map to match the number of heads. + x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) + + x = self.lay1(x) + x = self.gn1(x) + x = F.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = F.relu(x) + + cur_fpn = self.adapter1(fpns[0]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay3(x) + x = self.gn3(x) + x = F.relu(x) + + cur_fpn = self.adapter2(fpns[1]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay4(x) + x = self.gn4(x) + x = F.relu(x) + + cur_fpn = self.adapter3(fpns[2]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay5(x) + x = self.gn5(x) + x = F.relu(x) + + x = self.out_lay(x) + return x + + +class DetrMHAttentionMap(nn.Module): + """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + + self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 + + def forward(self, q, k, mask: Optional[Tensor] = None): + q = self.q_linear(q) + k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) + keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) + + if mask is not None: + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights = F.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = self.dropout(weights) + return weights + + +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +class DetrLoss(nn.Module): + """ + This class computes the losses for DetrForObjectDetection/DetrForSegmentation. The process happens in two steps: 1) + we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair + of matched ground-truth / prediction (supervise class and box) + """ + + def __init__(self, matcher, num_classes, eos_coef, losses): + """ + Create the criterion. + + A note on the num_classes parameter (copied from original repo in detr.py): "the naming of the `num_classes` + parameter of the criterion is somewhat misleading. it indeed corresponds to `max_obj_id + 1`, where max_obj_id + is the maximum id for a class in your dataset. For example, COCO has a max_obj_id of 90, so we pass + `num_classes` to be 91. As another example, for a dataset that has a single class with id 1, you should pass + `num_classes` to be 2 (max_obj_id + 1). For more details on this, check the following discussion + https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223" + + Parameters: + matcher: module able to compute a matching between targets and proposals. + num_classes: number of object categories, omitting the special no-object category. + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category. + losses: list of all the losses to be applied. See get_loss for list of available losses. + """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer("empty_weight", empty_weight) + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim + [nb_target_boxes] + """ + assert "logits" in outputs, "No logits were found in the outputs" + src_logits = outputs["logits"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. + """ + logits = outputs["logits"] + device = logits.device + tgt_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + assert "pred_boxes" in outputs, "No predicted boxes found in outputs" + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the masks: the focal loss and the dice loss. + + Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. + """ + assert "pred_masks" in outputs, "No predicted masks found in outputs" + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = F.interpolate( + src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False + ) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + "masks": self.loss_masks, + } + assert loss in loss_map, f"Loss {loss} not supported" + return loss_map[loss](outputs, targets, indices, num_boxes) + + def forward(self, outputs, targets): + """ + This performs the loss computation. + + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +class DetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/matcher.py +class DetrHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + """ + Creates the matcher. + + Params: + class_cost: This is the relative weight of the classification error in the matching cost + bbox_cost: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost + giou_cost: This is the relative weight of the giou loss of the bounding box in the matching cost + """ + super().__init__() + + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + assert class_cost != 0 or bbox_cost != 0 or giou_cost != 0, "All costs of the Matcher can't be 0" + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Performs the matching. + + Params: + outputs: This is a dict that contains at least these entries: + "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels "boxes": Tensor of dim [num_target_boxes, 4] + containing the target box coordinates + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["class_labels"] for v in targets]) + tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. + class_cost = -out_prob[:, tgt_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(tgt_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(bs, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py + + +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. + + Args: + boxes (Tensor[N, 4]): boxes for which the area will be computed. They + are expected to be in (x1, y1, x2, y2) format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. + + Returns: + area (Tensor[N]): area for each box + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# modified from torchvision to also return the union +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. + + Returns: + a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / area + + +# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306 + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 1e586729615..9bfb9722170 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -39,6 +39,7 @@ from .file_utils import ( is_sentencepiece_available, is_soundfile_availble, is_tf_available, + is_timm_available, is_tokenizers_available, is_torch_available, is_torch_tpu_available, @@ -229,6 +230,19 @@ def require_onnx(test_case): return test_case +def require_timm(test_case): + """ + Decorator marking a test that requires Timm. + + These tests are skipped when Timm isn't installed. + + """ + if not is_timm_available(): + return unittest.skip("test requires Timm")(test_case) + else: + return test_case + + def require_torch(test_case): """ Decorator marking a test that requires PyTorch. diff --git a/src/transformers/utils/coco_classes.py b/src/transformers/utils/coco_classes.py new file mode 100644 index 00000000000..cc540052aef --- /dev/null +++ b/src/transformers/utils/coco_classes.py @@ -0,0 +1,94 @@ +# COCO object detection id's to class names +id2label = { + 0: "N/A", + 1: "person", + 2: "bicycle", + 3: "car", + 4: "motorcycle", + 5: "airplane", + 6: "bus", + 7: "train", + 8: "truck", + 9: "boat", + 10: "traffic light", + 11: "fire hydrant", + 12: "N/A", + 13: "stop sign", + 14: "parking meter", + 15: "bench", + 16: "bird", + 17: "cat", + 18: "dog", + 19: "horse", + 20: "sheep", + 21: "cow", + 22: "elephant", + 23: "bear", + 24: "zebra", + 25: "giraffe", + 26: "N/A", + 27: "backpack", + 28: "umbrella", + 29: "N/A", + 30: "N/A", + 31: "handbag", + 32: "tie", + 33: "suitcase", + 34: "frisbee", + 35: "skis", + 36: "snowboard", + 37: "sports ball", + 38: "kite", + 39: "baseball bat", + 40: "baseball glove", + 41: "skateboard", + 42: "surfboard", + 43: "tennis racket", + 44: "bottle", + 45: "N/A", + 46: "wine glass", + 47: "cup", + 48: "fork", + 49: "knife", + 50: "spoon", + 51: "bowl", + 52: "banana", + 53: "apple", + 54: "sandwich", + 55: "orange", + 56: "broccoli", + 57: "carrot", + 58: "hot dog", + 59: "pizza", + 60: "donut", + 61: "cake", + 62: "chair", + 63: "couch", + 64: "potted plant", + 65: "bed", + 66: "N/A", + 67: "dining table", + 68: "N/A", + 69: "N/A", + 70: "toilet", + 71: "N/A", + 72: "tv", + 73: "laptop", + 74: "mouse", + 75: "remote", + 76: "keyboard", + 77: "cell phone", + 78: "microwave", + 79: "oven", + 80: "toaster", + 81: "sink", + 82: "refrigerator", + 83: "N/A", + 84: "book", + 85: "clock", + 86: "vase", + 87: "scissors", + 88: "teddy bear", + 89: "hair drier", + 90: "toothbrush", +} diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 000a05d31df..036a0a1c5ac 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -334,6 +334,9 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = None MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = None +MODEL_FOR_OBJECT_DETECTION_MAPPING = None + + MODEL_FOR_PRETRAINING_MAPPING = None diff --git a/src/transformers/utils/dummy_timm_and_vision_objects.py b/src/transformers/utils/dummy_timm_and_vision_objects.py new file mode 100644 index 00000000000..33acdf77725 --- /dev/null +++ b/src/transformers/utils/dummy_timm_and_vision_objects.py @@ -0,0 +1,24 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..file_utils import requires_backends + + +DETR_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class DetrForObjectDetection: + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) + + +class DetrForSegmentation: + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) + + +class DetrModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) diff --git a/src/transformers/utils/dummy_timm_objects.py b/src/transformers/utils/dummy_timm_objects.py new file mode 100644 index 00000000000..bc46f681553 --- /dev/null +++ b/src/transformers/utils/dummy_timm_objects.py @@ -0,0 +1,24 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..file_utils import requires_backends + + +DETR_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class DetrForObjectDetection: + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm"]) + + +class DetrForSegmentation: + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm"]) + + +class DetrModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm"]) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_backends(self, ["timm"]) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 1798c9f73c8..84b37d35dfb 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -22,6 +22,11 @@ class DeiTFeatureExtractor: requires_backends(self, ["vision"]) +class DetrFeatureExtractor: + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class ViTFeatureExtractor: def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) diff --git a/tests/fixtures/coco.jpg b/tests/fixtures/coco.jpg deleted file mode 100644 index d32344928e3..00000000000 Binary files a/tests/fixtures/coco.jpg and /dev/null differ diff --git a/tests/fixtures/tests_samples/.gitignore b/tests/fixtures/tests_samples/.gitignore index f5030eb61e7..1d7141c43dc 100644 --- a/tests/fixtures/tests_samples/.gitignore +++ b/tests/fixtures/tests_samples/.gitignore @@ -1,4 +1,3 @@ -*.* cache* temp* !*.txt diff --git a/tests/fixtures/tests_samples/COCO/cats.png b/tests/fixtures/tests_samples/COCO/000000039769.png similarity index 100% rename from tests/fixtures/tests_samples/COCO/cats.png rename to tests/fixtures/tests_samples/COCO/000000039769.png diff --git a/tests/fixtures/tests_samples/COCO/coco_annotations.txt b/tests/fixtures/tests_samples/COCO/coco_annotations.txt new file mode 100644 index 00000000000..bd8c86a9bc3 --- /dev/null +++ b/tests/fixtures/tests_samples/COCO/coco_annotations.txt @@ -0,0 +1 @@ +[{"segmentation": [[333.96, 175.14, 338.26, 134.33, 342.55, 95.67, 348.99, 79.57, 368.32, 80.64, 371.54, 91.38, 364.03, 106.41, 356.51, 145.07, 351.14, 166.55, 350.07, 184.8, 345.77, 185.88, 332.89, 178.36, 332.89, 172.99]], "area": 2120.991099999999, "iscrowd": 0, "image_id": 39769, "bbox": [332.89, 79.57, 38.65, 106.31], "category_id": 75, "id": 1108446}, {"segmentation": [[44.03, 86.01, 112.75, 74.2, 173.96, 77.42, 175.03, 89.23, 170.74, 98.9, 147.11, 102.12, 54.77, 119.3, 53.69, 119.3, 44.03, 113.93, 41.88, 94.6, 41.88, 94.6]], "area": 4052.607, "iscrowd": 0, "image_id": 39769, "bbox": [41.88, 74.2, 133.15, 45.1], "category_id": 75, "id": 1110067}, {"segmentation": [[1.08, 473.53, 633.17, 473.53, 557.66, 376.45, 535.01, 366.74, 489.71, 305.26, 470.29, 318.2, 456.27, 351.64, 413.12, 363.51, 376.45, 358.11, 348.4, 350.56, 363.51, 331.15, 357.03, 288.0, 353.8, 257.8, 344.09, 190.92, 333.3, 177.98, 345.17, 79.82, 284.76, 130.52, 265.35, 151.01, 308.49, 189.84, 317.12, 215.73, 293.39, 243.78, 269.66, 212.49, 235.15, 199.55, 214.65, 193.08, 187.69, 217.89, 159.64, 278.29, 135.91, 313.89, 169.35, 292.31, 203.87, 281.53, 220.04, 292.31, 220.04, 307.42, 175.82, 345.17, 155.33, 360.27, 105.71, 363.51, 85.21, 374.29, 74.43, 366.74, 70.11, 465.98, 42.07, 471.37, 33.44, 457.35, 34.52, 414.2, 29.12, 368.9, 9.71, 291.24, 46.38, 209.26, 99.24, 128.36, 131.6, 107.87, 50.7, 117.57, 40.99, 103.55, 40.99, 85.21, 60.4, 77.66, 141.3, 70.11, 173.66, 72.27, 174.74, 92.76, 204.94, 72.27, 225.44, 62.56, 262.11, 56.09, 292.31, 53.93, 282.61, 81.98, 298.79, 96.0, 310.65, 102.47, 348.4, 74.43, 373.21, 81.98, 430.38, 35.6, 484.31, 23.73, 540.4, 46.38, 593.26, 66.88, 638.56, 80.9, 632.09, 145.62, 581.39, 118.65, 543.64, 130.52, 533.93, 167.19, 512.36, 197.39, 498.34, 218.97, 529.62, 253.48, 549.03, 273.98, 584.63, 276.13, 587.87, 293.39, 566.29, 305.26, 531.78, 298.79, 549.03, 319.28, 576.0, 358.11, 560.9, 376.45, 639.64, 471.37, 639.64, 2.16, 1.08, 0.0]], "area": 176277.55269999994, "iscrowd": 0, "image_id": 39769, "bbox": [1.08, 0.0, 638.56, 473.53], "category_id": 63, "id": 1605237}, {"segmentation": [[1.07, 1.18, 640.0, 3.33, 638.93, 472.59, 4.3, 479.03]], "area": 301552.6694999999, "iscrowd": 0, "image_id": 39769, "bbox": [1.07, 1.18, 638.93, 477.85], "category_id": 65, "id": 1612051}, {"segmentation": [[138.75, 319.38, 148.75, 294.38, 165.0, 246.87, 197.5, 205.63, 247.5, 203.13, 268.75, 216.88, 280.0, 239.38, 293.75, 244.38, 303.75, 241.88, 307.5, 228.13, 318.75, 220.63, 315.0, 200.63, 291.25, 171.88, 265.0, 156.88, 258.75, 148.13, 262.5, 135.63, 282.5, 123.13, 292.5, 115.63, 311.25, 108.13, 313.75, 106.88, 296.25, 93.13, 282.5, 84.38, 292.5, 64.38, 288.75, 60.63, 266.25, 54.38, 232.5, 63.12, 206.25, 70.63, 170.0, 100.63, 136.25, 114.38, 101.25, 138.13, 56.25, 194.38, 27.5, 259.38, 17.5, 299.38, 32.5, 378.13, 31.25, 448.13, 41.25, 469.38, 66.25, 466.88, 70.0, 419.38, 71.25, 391.88, 77.5, 365.63, 113.75, 364.38, 145.0, 360.63, 168.75, 349.38, 191.25, 330.63, 212.5, 319.38, 223.75, 305.63, 206.25, 286.88, 172.5, 288.13]], "area": 53301.618749999994, "iscrowd": 0, "image_id": 39769, "bbox": [17.5, 54.38, 301.25, 415.0], "category_id": 17, "id": 2190839}, {"segmentation": [[543.75, 136.88, 570.0, 114.38, 591.25, 123.13, 616.25, 140.63, 640.0, 143.13, 636.25, 124.37, 605.0, 103.13, 640.0, 103.13, 633.75, 86.88, 587.5, 73.13, 548.75, 49.38, 505.0, 35.63, 462.5, 25.63, 405.0, 48.13, 362.5, 111.88, 347.5, 179.38, 355.0, 220.63, 356.25, 230.63, 365.0, 264.38, 358.75, 266.88, 358.75, 270.63, 356.25, 291.88, 356.25, 325.63, 355.0, 338.13, 350.0, 348.13, 365.0, 354.38, 396.25, 351.88, 423.75, 355.63, 446.25, 350.63, 460.0, 345.63, 462.5, 321.88, 468.75, 306.88, 481.25, 299.38, 516.25, 341.88, 536.25, 368.13, 570.0, 369.38, 578.75, 359.38, 555.0, 330.63, 532.5, 298.13, 563.75, 299.38, 582.5, 298.13, 586.25, 286.88, 578.75, 278.13, 548.75, 269.38, 525.0, 256.88, 505.0, 206.88, 536.25, 161.88, 540.0, 149.38]], "area": 59700.95625, "iscrowd": 0, "image_id": 39769, "bbox": [347.5, 25.63, 292.5, 343.75], "category_id": 17, "id": 2190842}] \ No newline at end of file diff --git a/tests/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png b/tests/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png new file mode 100644 index 00000000000..9dc23525d6e Binary files /dev/null and b/tests/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png differ diff --git a/tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt b/tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt new file mode 100644 index 00000000000..90a9798be2a --- /dev/null +++ b/tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt @@ -0,0 +1 @@ +[{"id": 8222595, "category_id": 17, "iscrowd": 0, "bbox": [18, 54, 301, 415], "area": 53306}, {"id": 8225432, "category_id": 17, "iscrowd": 0, "bbox": [349, 26, 291, 343], "area": 59627}, {"id": 8798150, "category_id": 63, "iscrowd": 0, "bbox": [1, 0, 639, 474], "area": 174579}, {"id": 14466198, "category_id": 75, "iscrowd": 0, "bbox": [42, 74, 133, 45], "area": 4068}, {"id": 12821912, "category_id": 75, "iscrowd": 0, "bbox": [333, 80, 38, 106], "area": 2118}, {"id": 10898909, "category_id": 93, "iscrowd": 0, "bbox": [0, 0, 640, 480], "area": 2750}] \ No newline at end of file diff --git a/tests/test_feature_extraction_common.py b/tests/test_feature_extraction_common.py index 49dfa6dfd4d..217da135ca1 100644 --- a/tests/test_feature_extraction_common.py +++ b/tests/test_feature_extraction_common.py @@ -18,6 +18,57 @@ import json import os import tempfile +from transformers.file_utils import is_torch_available, is_vision_available + + +if is_torch_available(): + import numpy as np + import torch + +if is_vision_available(): + from PIL import Image + + +def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + """ + + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" + + if equal_resolution: + image_inputs = [] + for i in range(feature_extract_tester.batch_size): + image_inputs.append( + np.random.randint( + 255, + size=( + feature_extract_tester.num_channels, + feature_extract_tester.max_resolution, + feature_extract_tester.max_resolution, + ), + dtype=np.uint8, + ) + ) + else: + image_inputs = [] + for i in range(feature_extract_tester.batch_size): + width, height = np.random.choice( + np.arange(feature_extract_tester.min_resolution, feature_extract_tester.max_resolution), 2 + ) + image_inputs.append( + np.random.randint(255, size=(feature_extract_tester.num_channels, width, height), dtype=np.uint8) + ) + + if not numpify and not torchify: + # PIL expects the channel dimension as last dimension + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + if torchify: + image_inputs = [torch.from_numpy(x) for x in image_inputs] + + return image_inputs + class FeatureExtractionSavingTestMixin: def test_feat_extract_to_json_string(self): diff --git a/tests/test_feature_extraction_deit.py b/tests/test_feature_extraction_deit.py index a2b60eafe6e..dc86074dc98 100644 --- a/tests/test_feature_extraction_deit.py +++ b/tests/test_feature_extraction_deit.py @@ -21,7 +21,7 @@ import numpy as np from transformers.file_utils import is_torch_available, is_vision_available from transformers.testing_utils import require_torch, require_vision -from .test_feature_extraction_common import FeatureExtractionSavingTestMixin +from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs if is_torch_available(): @@ -75,36 +75,6 @@ class DeiTFeatureExtractionTester(unittest.TestCase): "image_std": self.image_std, } - def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. - """ - - assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" - - if equal_resolution: - image_inputs = [] - for i in range(self.batch_size): - image_inputs.append( - np.random.randint( - 255, size=(self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8 - ) - ) - else: - image_inputs = [] - for i in range(self.batch_size): - width, height = np.random.choice(np.arange(self.min_resolution, self.max_resolution), 2) - image_inputs.append(np.random.randint(255, size=(self.num_channels, width, height), dtype=np.uint8)) - - if not numpify and not torchify: - # PIL expects the channel dimension as last dimension - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - if torchify: - image_inputs = [torch.from_numpy(x) for x in image_inputs] - - return image_inputs - @require_torch @require_vision @@ -136,7 +106,7 @@ class DeiTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC # Initialize feature_extractor feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random PIL images - image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False) + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) for image in image_inputs: self.assertIsInstance(image, Image.Image) @@ -168,7 +138,7 @@ class DeiTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC # Initialize feature_extractor feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random numpy tensors - image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, numpify=True) + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) for image in image_inputs: self.assertIsInstance(image, np.ndarray) @@ -200,7 +170,7 @@ class DeiTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC # Initialize feature_extractor feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random PyTorch tensors - image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True) + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) for image in image_inputs: self.assertIsInstance(image, torch.Tensor) diff --git a/tests/test_feature_extraction_detr.py b/tests/test_feature_extraction_detr.py new file mode 100644 index 00000000000..8f36ad418f5 --- /dev/null +++ b/tests/test_feature_extraction_detr.py @@ -0,0 +1,339 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow + +from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DetrFeatureExtractor + + +class DetrFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "max_size": self.max_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to DetrFeatureExtractor, + assuming do_resize is set to True with a scalar size. + """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size * h / w) + expected_width = self.size + elif w > h: + expected_height = self.size + expected_width = int(self.size * w / h) + else: + expected_height = self.size + expected_width = self.size + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + +@require_torch +@require_vision +class DetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = DetrFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = DetrFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "max_size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_equivalence_pad_and_create_pixel_mask(self): + # Initialize feature_extractors + feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict) + feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test whether the method "pad_and_return_pixel_mask" and calling the feature extractor return the same tensors + encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") + encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") + + assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # encode them + # TODO replace by facebook/detr-resnet-50 + feature_extractor = DetrFeatureExtractor.from_pretrained("nielsr/detr-resnet-50") + encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + assert torch.allclose(encoding["target"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["target"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + assert torch.allclose(encoding["target"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["target"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["target"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + assert torch.allclose(encoding["target"][0]["class_labels"], expected_class_labels) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["target"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["target"][0]["size"], expected_size) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # encode them + # TODO replace by .from_pretrained facebook/detr-resnet-50-panoptic + feature_extractor = DetrFeatureExtractor(format="coco_panoptic") + encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + assert torch.allclose(encoding["target"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["target"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + assert torch.allclose(encoding["target"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["target"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["target"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + assert torch.allclose(encoding["target"][0]["class_labels"], expected_class_labels) + # verify masks + expected_masks_sum = 822338 + self.assertEqual(encoding["target"][0]["masks"].sum().item(), expected_masks_sum) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["target"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["target"][0]["size"], expected_size) diff --git a/tests/test_feature_extraction_vit.py b/tests/test_feature_extraction_vit.py index 5c8db9baa63..283a94d8ac1 100644 --- a/tests/test_feature_extraction_vit.py +++ b/tests/test_feature_extraction_vit.py @@ -21,7 +21,7 @@ import numpy as np from transformers.file_utils import is_torch_available, is_vision_available from transformers.testing_utils import require_torch, require_vision -from .test_feature_extraction_common import FeatureExtractionSavingTestMixin +from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs if is_torch_available(): @@ -69,36 +69,6 @@ class ViTFeatureExtractionTester(unittest.TestCase): "size": self.size, } - def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. - """ - - assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" - - if equal_resolution: - image_inputs = [] - for i in range(self.batch_size): - image_inputs.append( - np.random.randint( - 255, size=(self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8 - ) - ) - else: - image_inputs = [] - for i in range(self.batch_size): - width, height = np.random.choice(np.arange(self.min_resolution, self.max_resolution), 2) - image_inputs.append(np.random.randint(255, size=(self.num_channels, width, height), dtype=np.uint8)) - - if not numpify and not torchify: - # PIL expects the channel dimension as last dimension - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - if torchify: - image_inputs = [torch.from_numpy(x) for x in image_inputs] - - return image_inputs - @require_torch @require_vision @@ -128,7 +98,7 @@ class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCa # Initialize feature_extractor feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random PIL images - image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False) + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) for image in image_inputs: self.assertIsInstance(image, Image.Image) @@ -160,7 +130,7 @@ class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCa # Initialize feature_extractor feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random numpy tensors - image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, numpify=True) + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) for image in image_inputs: self.assertIsInstance(image, np.ndarray) @@ -192,7 +162,7 @@ class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCa # Initialize feature_extractor feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random PyTorch tensors - image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True) + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) for image in image_inputs: self.assertIsInstance(image, torch.Tensor) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 7223bfa5376..272f25a0ecf 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -21,7 +21,7 @@ import random import tempfile import unittest import warnings -from typing import List, Tuple +from typing import Dict, List, Tuple from huggingface_hub import HfApi from requests.exceptions import HTTPError @@ -982,7 +982,6 @@ class ModelTesterMixin: outputs = model(**inputs) - print(outputs) output = outputs[0] if config.is_encoder_decoder: @@ -1236,6 +1235,11 @@ class ModelTesterMixin: if isinstance(tuple_object, (List, Tuple)): for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) elif tuple_object is None: return else: diff --git a/tests/test_modeling_deit.py b/tests/test_modeling_deit.py index d4d95f0b491..5551da08903 100644 --- a/tests/test_modeling_deit.py +++ b/tests/test_modeling_deit.py @@ -360,7 +360,7 @@ class DeiTModelTest(ModelTesterMixin, unittest.TestCase): # We will verify our results on an image of cute cats def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/cats.png") + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") return image diff --git a/tests/test_modeling_detr.py b/tests/test_modeling_detr.py new file mode 100644 index 00000000000..093e75cf993 --- /dev/null +++ b/tests/test_modeling_detr.py @@ -0,0 +1,527 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch DETR model. """ + + +import inspect +import math +import unittest + +from transformers import is_timm_available, is_vision_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_timm, require_vision, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor + + +if is_timm_available(): + import torch + + from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrModel + + +if is_vision_available(): + from PIL import Image + + from transformers import DetrFeatureExtractor + + +@require_timm +class DetrModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=256, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + min_size=200, + max_size=200, + n_targets=8, + num_labels=91, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.min_size = min_size + self.max_size = max_size + self.n_targets = n_targets + self.num_labels = num_labels + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) + + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) + labels.append(target) + + config = DetrConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + ) + return config, pixel_values, pixel_mask, labels + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_detr_model(self, config, pixel_values, pixel_mask, labels): + model = DetrModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) + ) + + def create_and_check_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = DetrForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_timm +class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + DetrModel, + DetrForObjectDetection, + DetrForSegmentation, + ) + if is_timm_available() + else () + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ in ["DetrForObjectDetection", "DetrForSegmentation"]: + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = DetrModelTester(self) + self.config_tester = ConfigTester(self, config_class=DetrConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_detr_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_detr_model(*config_and_inputs) + + def test_detr_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_detr_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="DETR does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="DETR does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="DETR is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="DETR does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + decoder_seq_length = self.model_tester.decoder_seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + decoder_key_length = self.model_tester.decoder_seq_length + encoder_key_length = self.model_tester.encoder_seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "DetrForObjectDetection": + correct_outlen += 2 + # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks + if model_class.__name__ == "DetrForSegmentation": + correct_outlen += 3 + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "DetrForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels + 1, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + configs_no_init.init_xavier_std = 1e9 + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "bbox_attention" in name and "bias" not in name: + self.assertLess( + 100000, + abs(param.data.max().item()), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class DetrModelIntegrationTests(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50") if is_vision_available() else None + + def test_inference_no_head(self): + model = DetrModel.from_pretrained("facebook/detr-resnet-50").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape = torch.Size((1, 100, 256)) + assert outputs.last_hidden_state.shape == expected_shape + expected_slice = torch.tensor( + [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + def test_inference_object_detection_head(self): + model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + def test_inference_panoptic_segmentation_head(self): + model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-18.1565, -1.7568, -13.5029], [-16.8888, -1.4138, -14.1028], [-17.5709, -2.5080, -11.8654]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.5344, 0.1789, 0.9285], [0.4420, 0.0572, 0.0875], [0.6630, 0.6887, 0.1017]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + expected_shape_masks = torch.Size((1, model.config.num_queries, 200, 267)) + self.assertEqual(outputs.pred_masks.shape, expected_shape_masks) + expected_slice_masks = torch.tensor( + [[-7.7558, -10.8788, -11.9797], [-11.8881, -16.4329, -17.7451], [-14.7316, -19.7383, -20.3004]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, atol=1e-4)) diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index b5436b7dc0e..09d4fa372a5 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -322,7 +322,7 @@ class ViTModelTest(ModelTesterMixin, unittest.TestCase): # We will verify our results on an image of cute cats def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/cats.png") + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") return image diff --git a/tests/test_pipelines_image_classification.py b/tests/test_pipelines_image_classification.py index 32b13174613..ecfab4c76dd 100644 --- a/tests/test_pipelines_image_classification.py +++ b/tests/test_pipelines_image_classification.py @@ -47,11 +47,26 @@ class ImageClassificationPipelineTests(unittest.TestCase): "http://images.cocodataset.org/val2017/000000039769.jpg", ] }, - {"images": "tests/fixtures/coco.jpg"}, - {"images": ["tests/fixtures/coco.jpg", "tests/fixtures/coco.jpg"]}, - {"images": Image.open("tests/fixtures/coco.jpg")}, - {"images": [Image.open("tests/fixtures/coco.jpg"), Image.open("tests/fixtures/coco.jpg")]}, - {"images": [Image.open("tests/fixtures/coco.jpg"), "tests/fixtures/coco.jpg"]}, + {"images": "./tests/fixtures/tests_samples/COCO/000000039769.png"}, + { + "images": [ + "./tests/fixtures/tests_samples/COCO/000000039769.png", + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ] + }, + {"images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")}, + { + "images": [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + ] + }, + { + "images": [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ] + }, ] def test_small_model_from_factory(self): diff --git a/utils/check_repo.py b/utils/check_repo.py index e0eed1dbe80..63499fe5f8d 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -38,6 +38,9 @@ IGNORE_NON_TESTED = [ "BigBirdPegasusEncoder", # Building part of bigger (tested) model. "BigBirdPegasusDecoder", # Building part of bigger (tested) model. "BigBirdPegasusDecoderWrapper", # Building part of bigger (tested) model. + "DetrEncoder", # Building part of bigger (tested) model. + "DetrDecoder", # Building part of bigger (tested) model. + "DetrDecoderWrapper", # Building part of bigger (tested) model. "M2M100Encoder", # Building part of bigger (tested) model. "M2M100Decoder", # Building part of bigger (tested) model. "Speech2TextEncoder", # Building part of bigger (tested) model. @@ -95,6 +98,7 @@ IGNORE_NON_AUTO_CONFIGURED = [ "CLIPVisionModel", "FlaxCLIPTextModel", "FlaxCLIPVisionModel", + "DetrForSegmentation", "DPRReader", "DPRSpanPredictor", "FlaubertForQuestionAnswering",