diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 161485f6faa..b2c11b90489 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -308,8 +308,12 @@ layoutlm_job = CircleCIJob( "pip install 'git+https://github.com/facebookresearch/detectron2.git'", "sudo apt install tesseract-ocr", "pip install pytesseract", + "pip install natten", + ], + tests_to_run=[ + "tests/models/*layoutlmv*", + "tests/models/*nat", ], - tests_to_run="tests/models/*layoutlmv*", pytest_num_workers=1, pytest_options={"durations": 100}, ) diff --git a/README.md b/README.md index 39d50828a86..902e5b3e001 100644 --- a/README.md +++ b/README.md @@ -295,6 +295,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DiNAT](https://huggingface.co/docs/transformers/main/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. @@ -348,6 +349,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[NAT](https://huggingface.co/docs/transformers/main/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. diff --git a/README_es.md b/README_es.md index 12786494b42..2aa6a93e8fc 100644 --- a/README_es.md +++ b/README_es.md @@ -295,6 +295,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. 
**[DiNAT](https://huggingface.co/docs/transformers/main/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. @@ -348,6 +349,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[NAT](https://huggingface.co/docs/transformers/main/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. 
**[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. diff --git a/README_ja.md b/README_ja.md index 37e95944477..7a4cd9c833f 100644 --- a/README_ja.md +++ b/README_ja.md @@ -330,6 +330,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DiNAT](https://huggingface.co/docs/transformers/main/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. @@ -383,6 +384,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. 
**[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[NAT](https://huggingface.co/docs/transformers/main/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. diff --git a/README_ko.md b/README_ko.md index 36a0e45e1dc..8a45a918ba5 100644 --- a/README_ko.md +++ b/README_ko.md @@ -245,6 +245,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DiNAT](https://huggingface.co/docs/transformers/main/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. 
The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. @@ -298,6 +299,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[NAT](https://huggingface.co/docs/transformers/main/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. diff --git a/README_zh-hans.md b/README_zh-hans.md index 8068ed15235..fc53f392c1b 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -269,6 +269,7 @@ conda install -c huggingface transformers 1. 
**[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (来自 Facebook) 伴随论文 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 由 Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 发布。 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。 +1. **[DiNAT](https://huggingface.co/docs/transformers/main/model_doc/dinat)** (来自 SHI Labs) 伴随论文 [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) 由 Ali Hassani and Humphrey Shi 发布。 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) 和德语版 DistilBERT。 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (来自 Microsoft Research) 伴随论文 [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 由 Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei 发布。 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (来自 NAVER) 伴随论文 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 由 Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 发布。 @@ -322,6 +323,7 @@ conda install -c huggingface transformers 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (来自 中国人民大学 AI Box) 伴随论文 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 由 Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 发布。 +1. **[NAT](https://huggingface.co/docs/transformers/main/model_doc/nat)** (来自 SHI Labs) 伴随论文 [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 由 Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi 发布。 1. 
**[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (来自华为诺亚方舟实验室) 伴随论文 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 由 Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 发布。 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 1671d778018..bd1cb738731 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -281,6 +281,7 @@ conda install -c huggingface transformers 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DiNAT](https://huggingface.co/docs/transformers/main/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. 
**[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. @@ -334,6 +335,7 @@ conda install -c huggingface transformers 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[NAT](https://huggingface.co/docs/transformers/main/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d441b370eed..eea208df1f6 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -396,6 +396,8 @@ title: DeiT - local: model_doc/detr title: DETR + - local: model_doc/dinat + title: DiNAT - local: model_doc/dit title: DiT - local: model_doc/dpt @@ -412,6 +414,8 @@ title: MobileNetV2 - local: model_doc/mobilevit title: MobileViT + - local: model_doc/nat + title: NAT - local: model_doc/poolformer title: PoolFormer - local: model_doc/regnet diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 660ff81dde5..f1b75a982f2 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -83,6 +83,7 @@ The documentation is organized into five sections: 1. 
**[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. 1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DiNAT](model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. 1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[Donut](model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. @@ -136,6 +137,7 @@ The documentation is organized into five sections: 1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[NAT](model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. 
**[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. @@ -244,6 +246,7 @@ Flax), PyTorch, and/or TensorFlow. | Deformable DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | | DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DiNAT | ❌ | ❌ | ✅ | ❌ | ❌ | | DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | | DonutSwin | ❌ | ❌ | ✅ | ❌ | ❌ | | DPR | ✅ | ✅ | ✅ | ✅ | ❌ | @@ -290,6 +293,7 @@ Flax), PyTorch, and/or TensorFlow. | MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | | MT5 | ✅ | ✅ | ✅ | ✅ | ✅ | | MVP | ✅ | ✅ | ✅ | ❌ | ❌ | +| NAT | ❌ | ❌ | ✅ | ❌ | ❌ | | Nezha | ❌ | ❌ | ✅ | ❌ | ❌ | | Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ | | OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/dinat.mdx b/docs/source/en/model_doc/dinat.mdx new file mode 100644 index 00000000000..c8cebd921e0 --- /dev/null +++ b/docs/source/en/model_doc/dinat.mdx @@ -0,0 +1,78 @@ + + +# Dilated Neighborhood Attention Transformer + +## Overview + +DiNAT was proposed in [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) +by Ali Hassani and Humphrey Shi. + +It extends [NAT](nat) by adding a Dilated Neighborhood Attention pattern to capture global context, +and shows significant performance improvements over it. + +The abstract from the paper is the following: + +*Transformers are quickly becoming one of the most heavily applied deep learning architectures across modalities, +domains, and tasks. In vision, on top of ongoing efforts into plain transformers, hierarchical transformers have +also gained significant attention, thanks to their performance and easy integration into existing frameworks. +These models typically employ localized attention mechanisms, such as the sliding-window Neighborhood Attention (NA) +or Swin Transformer's Shifted Window Self Attention. While effective at reducing self attention's quadratic complexity, +local attention weakens two of the most desirable properties of self attention: long range inter-dependency modeling, +and global receptive field. In this paper, we introduce Dilated Neighborhood Attention (DiNA), a natural, flexible and +efficient extension to NA that can capture more global context and expand receptive fields exponentially at no +additional cost. NA's local attention and DiNA's sparse global attention complement each other, and therefore we +introduce Dilated Neighborhood Attention Transformer (DiNAT), a new hierarchical vision transformer built upon both. +DiNAT variants enjoy significant improvements over strong baselines such as NAT, Swin, and ConvNeXt. +Our large model is faster and ahead of its Swin counterpart by 1.5% box AP in COCO object detection, +1.3% mask AP in COCO instance segmentation, and 1.1% mIoU in ADE20K semantic segmentation. 
+Paired with new frameworks, our large variant is the new state of the art panoptic segmentation model on COCO (58.2 PQ) +and ADE20K (48.5 PQ), and instance segmentation model on Cityscapes (44.5 AP) and ADE20K (35.4 AP) (no extra data). +It also matches the state of the art specialized semantic segmentation models on ADE20K (58.2 mIoU), +and ranks second on Cityscapes (84.5 mIoU) (no extra data). * + +Tips: +- One can use the [`AutoImageProcessor`] API to prepare images for the model. +- DiNAT can be used as a *backbone*. When `output_hidden_states = True`, +it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, height, width, num_channels)`. + +Notes: +- DiNAT depends on [NATTEN](https://github.com/SHI-Labs/NATTEN/)'s implementation of Neighborhood Attention and Dilated Neighborhood Attention. +You can install it with pre-built wheels for Linux by referring to [shi-labs.com/natten](https://shi-labs.com/natten), or build on your system by running `pip install natten`. +Note that the latter will likely take time to compile. NATTEN does not support Windows devices yet. +- Patch size of 4 is only supported at the moment. + + + + Neighborhood Attention with different dilation values. +Taken from the original paper. + +This model was contributed by [Ali Hassani](https://huggingface.co/alihassanijr). +The original code can be found [here](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer). + + +## DinatConfig + +[[autodoc]] DinatConfig + + +## DinatModel + +[[autodoc]] DinatModel + - forward + +## DinatForImageClassification + +[[autodoc]] DinatForImageClassification + - forward diff --git a/docs/source/en/model_doc/nat.mdx b/docs/source/en/model_doc/nat.mdx new file mode 100644 index 00000000000..43b59fb471e --- /dev/null +++ b/docs/source/en/model_doc/nat.mdx @@ -0,0 +1,73 @@ + + +# Neighborhood Attention Transformer + +## Overview + +NAT was proposed in [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) +by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. + +It is a hierarchical vision transformer based on Neighborhood Attention, a sliding-window self attention pattern. + +The abstract from the paper is the following: + +*We present Neighborhood Attention (NA), the first efficient and scalable sliding-window attention mechanism for vision. +NA is a pixel-wise operation, localizing self attention (SA) to the nearest neighboring pixels, and therefore enjoys a +linear time and space complexity compared to the quadratic complexity of SA. The sliding-window pattern allows NA's +receptive field to grow without needing extra pixel shifts, and preserves translational equivariance, unlike +Swin Transformer's Window Self Attention (WSA). We develop NATTEN (Neighborhood Attention Extension), a Python package +with efficient C++ and CUDA kernels, which allows NA to run up to 40% faster than Swin's WSA while using up to 25% less +memory. We further present Neighborhood Attention Transformer (NAT), a new hierarchical transformer design based on NA +that boosts image classification and downstream vision performance. Experimental results on NAT are competitive; +NAT-Tiny reaches 83.2% top-1 accuracy on ImageNet, 51.4% mAP on MS-COCO and 48.4% mIoU on ADE20K, which is 1.9% +ImageNet accuracy, 1.0% COCO mAP, and 2.6% ADE20K mIoU improvement over a Swin model with similar size. 
* + +Tips: +- One can use the [`AutoImageProcessor`] API to prepare images for the model. +- NAT can be used as a *backbone*. When `output_hidden_states = True`, +it will output both `hidden_states` and `reshaped_hidden_states`. +The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than +`(batch_size, height, width, num_channels)`. + +Notes: +- NAT depends on [NATTEN](https://github.com/SHI-Labs/NATTEN/)'s implementation of Neighborhood Attention. +You can install it with pre-built wheels for Linux by referring to [shi-labs.com/natten](https://shi-labs.com/natten), +or build on your system by running `pip install natten`. +Note that the latter will likely take time to compile. NATTEN does not support Windows devices yet. +- Patch size of 4 is only supported at the moment. + + + + Neighborhood Attention compared to other attention patterns. +Taken from the original paper. + +This model was contributed by [Ali Hassani](https://huggingface.co/alihassanijr). +The original code can be found [here](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer). + + +## NatConfig + +[[autodoc]] NatConfig + + +## NatModel + +[[autodoc]] NatModel + - forward + +## NatForImageClassification + +[[autodoc]] NatForImageClassification + - forward diff --git a/setup.py b/setup.py index a94793997f6..ef533058b7b 100644 --- a/setup.py +++ b/setup.py @@ -125,6 +125,7 @@ _deps = [ "jieba", "kenlm", "nltk", + "natten>=0.14.4", "numpy>=1.17", "onnxconverter-common", "onnxruntime-tools>=1.4.2", @@ -283,6 +284,7 @@ extras["tf-speech"] = extras["audio"] extras["flax-speech"] = extras["audio"] extras["vision"] = deps_list("Pillow") extras["timm"] = deps_list("timm") +extras["natten"] = deps_list("natten") extras["codecarbon"] = deps_list("codecarbon") diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8079ea61e30..4a050097daa 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -202,6 +202,7 @@ _import_structure = { "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], "models.detr": ["DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetrConfig"], "models.dialogpt": [], + "models.dinat": ["DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DinatConfig"], "models.distilbert": ["DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DistilBertConfig", "DistilBertTokenizer"], "models.dit": [], "models.donut": ["DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "DonutProcessor", "DonutSwinConfig"], @@ -304,6 +305,7 @@ _import_structure = { "models.mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig", "MPNetTokenizer"], "models.mt5": ["MT5Config"], "models.mvp": ["MvpConfig", "MvpTokenizer"], + "models.nat": ["NAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "NatConfig"], "models.nezha": ["NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP", "NezhaConfig"], "models.nllb": [], "models.nystromformer": [ @@ -1239,6 +1241,14 @@ else: "DistilBertPreTrainedModel", ] ) + _import_structure["models.dinat"].extend( + [ + "DINAT_PRETRAINED_MODEL_ARCHIVE_LIST", + "DinatForImageClassification", + "DinatModel", + "DinatPreTrainedModel", + ] + ) _import_structure["models.donut"].extend( [ "DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1691,6 +1701,14 @@ else: ] ) _import_structure["models.mt5"].extend(["MT5EncoderModel", "MT5ForConditionalGeneration", "MT5Model"]) + _import_structure["models.nat"].extend( + [ + "NAT_PRETRAINED_MODEL_ARCHIVE_LIST", + "NatForImageClassification", + "NatModel", + "NatPreTrainedModel", + ] + ) _import_structure["models.nezha"].extend( [ 
"NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3344,6 +3362,7 @@ if TYPE_CHECKING: from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig + from .models.dinat import DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP, DinatConfig from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer from .models.donut import DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, DonutProcessor, DonutSwinConfig from .models.dpr import ( @@ -3439,6 +3458,7 @@ if TYPE_CHECKING: from .models.mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig, MPNetTokenizer from .models.mt5 import MT5Config from .models.mvp import MvpConfig, MvpTokenizer + from .models.nat import NAT_PRETRAINED_CONFIG_ARCHIVE_MAP, NatConfig from .models.nezha import NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP, NezhaConfig from .models.nystromformer import NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, NystromformerConfig from .models.openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig, OpenAIGPTTokenizer @@ -4197,6 +4217,12 @@ if TYPE_CHECKING: DeiTModel, DeiTPreTrainedModel, ) + from .models.dinat import ( + DINAT_PRETRAINED_MODEL_ARCHIVE_LIST, + DinatForImageClassification, + DinatModel, + DinatPreTrainedModel, + ) from .models.distilbert import ( DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, DistilBertForMaskedLM, @@ -4579,6 +4605,12 @@ if TYPE_CHECKING: MvpModel, MvpPreTrainedModel, ) + from .models.nat import ( + NAT_PRETRAINED_MODEL_ARCHIVE_LIST, + NatForImageClassification, + NatModel, + NatPreTrainedModel, + ) from .models.nezha import ( NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST, NezhaForMaskedLM, diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 0557e84aa53..b5fd55d1a75 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -31,6 +31,7 @@ deps = { "jieba": "jieba", "kenlm": "kenlm", "nltk": "nltk", + "natten": "natten>=0.14.4", "numpy": "numpy>=1.17", "onnxconverter-common": "onnxconverter-common", "onnxruntime-tools": "onnxruntime-tools>=1.4.2", diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 1020fd78ec2..8c29d868a62 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -53,6 +53,7 @@ from . import ( deit, detr, dialogpt, + dinat, distilbert, dit, donut, @@ -107,6 +108,7 @@ from . 
import ( mpnet, mt5, mvp, + nat, nezha, nllb, nystromformer, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c9d99e3e30d..b569487ca8c 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -58,6 +58,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("deformable_detr", "DeformableDetrConfig"), ("deit", "DeiTConfig"), ("detr", "DetrConfig"), + ("dinat", "DinatConfig"), ("distilbert", "DistilBertConfig"), ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), @@ -105,6 +106,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("mpnet", "MPNetConfig"), ("mt5", "MT5Config"), ("mvp", "MvpConfig"), + ("nat", "NatConfig"), ("nezha", "NezhaConfig"), ("nystromformer", "NystromformerConfig"), ("openai-gpt", "OpenAIGPTConfig"), @@ -202,6 +204,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("dinat", "DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("donut-swin", "DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("dpr", "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -245,6 +248,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( ("mobilevit", "MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("mpnet", "MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("mvp", "MVP_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("nat", "NAT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("nezha", "NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("nystromformer", "NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("openai-gpt", "OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -343,6 +347,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("deit", "DeiT"), ("detr", "DETR"), ("dialogpt", "DialoGPT"), + ("dinat", "DiNAT"), ("distilbert", "DistilBERT"), ("dit", "DiT"), ("donut-swin", "DonutSwin"), @@ -397,6 +402,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("mpnet", "MPNet"), ("mt5", "MT5"), ("mvp", "MVP"), + ("nat", "NAT"), ("nezha", "Nezha"), ("nllb", "NLLB"), ("nystromformer", "Nyströmformer"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 7eff2c2837e..414845bb410 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -48,6 +48,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( ("deformable_detr", "DeformableDetrFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), + ("dinat", "ViTFeatureExtractor"), ("donut-swin", "DonutFeatureExtractor"), ("dpt", "DPTFeatureExtractor"), ("flava", "FlavaFeatureExtractor"), @@ -62,6 +63,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( ("mctct", "MCTCTFeatureExtractor"), ("mobilenet_v2", "MobileNetV2FeatureExtractor"), ("mobilevit", "MobileViTFeatureExtractor"), + ("nat", "ViTFeatureExtractor"), ("owlvit", "OwlViTFeatureExtractor"), ("perceiver", "PerceiverFeatureExtractor"), ("poolformer", "PoolFormerFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 72492f4cab7..f57a70dc2ae 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -43,6 +43,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( ("cvt", "ConvNextImageProcessor"), ("data2vec-vision", 
"BeitImageProcessor"), ("deit", "DeiTImageProcessor"), + ("dinat", "ViTImageProcessor"), ("dpt", "DPTImageProcessor"), ("flava", "FlavaImageProcessor"), ("glpn", "GLPNImageProcessor"), @@ -53,6 +54,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( ("levit", "LevitImageProcessor"), ("mobilenet_v2", "MobileNetV2ImageProcessor"), ("mobilevit", "MobileViTImageProcessor"), + ("nat", "ViTImageProcessor"), ("perceiver", "PerceiverImageProcessor"), ("poolformer", "PoolFormerImageProcessor"), ("regnet", "ConvNextImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 11c841407bd..1fa7de8b98e 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -58,6 +58,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("deformable_detr", "DeformableDetrModel"), ("deit", "DeiTModel"), ("detr", "DetrModel"), + ("dinat", "DinatModel"), ("distilbert", "DistilBertModel"), ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), @@ -104,6 +105,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("mpnet", "MPNetModel"), ("mt5", "MT5Model"), ("mvp", "MvpModel"), + ("nat", "NatModel"), ("nezha", "NezhaModel"), ("nllb", "M2M100Model"), ("nystromformer", "NystromformerModel"), @@ -367,10 +369,12 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( ("cvt", "CvtForImageClassification"), ("data2vec-vision", "Data2VecVisionForImageClassification"), ("deit", ("DeiTForImageClassification", "DeiTForImageClassificationWithTeacher")), + ("dinat", "DinatForImageClassification"), ("imagegpt", "ImageGPTForImageClassification"), ("levit", ("LevitForImageClassification", "LevitForImageClassificationWithTeacher")), ("mobilenet_v2", "MobileNetV2ForImageClassification"), ("mobilevit", "MobileViTForImageClassification"), + ("nat", "NatForImageClassification"), ( "perceiver", ( diff --git a/src/transformers/models/dinat/__init__.py b/src/transformers/models/dinat/__init__.py new file mode 100644 index 00000000000..14ee01c6689 --- /dev/null +++ b/src/transformers/models/dinat/__init__.py @@ -0,0 +1,59 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING
+
+# rely on isort to merge the imports
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+
+
+_import_structure = {"configuration_dinat": ["DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DinatConfig"]}
+
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_dinat"] = [
+        "DINAT_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "DinatForImageClassification",
+        "DinatModel",
+        "DinatPreTrainedModel",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_dinat import DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP, DinatConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_dinat import (
+            DINAT_PRETRAINED_MODEL_ARCHIVE_LIST,
+            DinatForImageClassification,
+            DinatModel,
+            DinatPreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/dinat/configuration_dinat.py b/src/transformers/models/dinat/configuration_dinat.py
new file mode 100644
index 00000000000..d51fc660d1c
--- /dev/null
+++ b/src/transformers/models/dinat/configuration_dinat.py
@@ -0,0 +1,136 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Dilated Neighborhood Attention Transformer model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "shi-labs/dinat-mini-in1k-224": "https://huggingface.co/shi-labs/dinat-mini-in1k-224/resolve/main/config.json",
+    # See all Dinat models at https://huggingface.co/models?filter=dinat
+}
+
+
+class DinatConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DinatModel`]. It is used to instantiate a Dinat
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Dinat
+    [shi-labs/dinat-mini-in1k-224](https://huggingface.co/shi-labs/dinat-mini-in1k-224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        patch_size (`int`, *optional*, defaults to 4):
+            The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embed_dim (`int`, *optional*, defaults to 64):
+            Dimensionality of patch embedding.
+        depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 5]`):
+            Number of layers in each level of the encoder.
+ num_heads (`List[int]`, *optional*, defaults to `[2, 4, 8, 16]`): + Number of attention heads in each layer of the Transformer encoder. + kernel_size (`int`, *optional*, defaults to 7): + Neighborhood Attention kernel size. + dilations (`List[List[int]]`, *optional*, defaults to `[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]]`): + Dilation value of each NA layer in the Transformer encoder. + mlp_ratio (`float`, *optional*, defaults to 3.0): + Ratio of MLP hidden dimensionality to embedding dimensionality. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether or not a learnable bias should be added to the queries, keys and values. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + drop_path_rate (`float`, *optional*, defaults to 0.1): + Stochastic depth rate. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + patch_norm (`bool`, *optional*, defaults to `True`): + Whether or not to add layer normalization after patch embedding. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + + Example: + + ```python + >>> from transformers import DinatConfig, DinatModel + + >>> # Initializing a Dinat shi-labs/dinat-mini-in1k-224 style configuration + >>> configuration = DinatConfig() + + >>> # Initializing a model (with random weights) from the shi-labs/dinat-mini-in1k-224 style configuration + >>> model = DinatModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "dinat" + + attribute_map = { + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + } + + def __init__( + self, + patch_size=4, + num_channels=3, + embed_dim=64, + depths=[3, 4, 6, 5], + num_heads=[2, 4, 8, 16], + kernel_size=7, + dilations=[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]], + mlp_ratio=3.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + **kwargs + ): + super().__init__(**kwargs) + + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_layers = len(depths) + self.num_heads = num_heads + self.kernel_size = kernel_size + self.dilations = dilations + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.path_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + # we set the hidden_size attribute in order to make Dinat work with VisionEncoderDecoderModel + # this indicates the channel dimension after the last stage of the model + self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) diff --git 
a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py new file mode 100644 index 00000000000..ba78a00b199 --- /dev/null +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -0,0 +1,861 @@ +# coding=utf-8 +# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Dilated Neighborhood Attention Transformer model.""" + + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + OptionalDependencyNotAvailable, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_natten_available, + logging, + requires_backends, +) +from .configuration_dinat import DinatConfig + + +if is_natten_available(): + from natten.functional import natten2dav, natten2dqkrpb +else: + + def natten2dqkrpb(*args, **kwargs): + raise OptionalDependencyNotAvailable() + + def natten2dav(*args, **kwargs): + raise OptionalDependencyNotAvailable() + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "DinatConfig" +_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "shi-labs/dinat-mini-in1k-224" +_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "shi-labs/dinat-mini-in1k-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + + +DINAT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "shi-labs/dinat-mini-in1k-224", + # See all Dinat models at https://huggingface.co/models?filter=dinat +] + +# drop_path and DinatDropPath are from the timm library. + + +@dataclass +# Copied from transformers.models.nat.modeling_nat.NatEncoderOutput with Nat->Dinat +class DinatEncoderOutput(ModelOutput): + """ + Dinat encoder's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.nat.modeling_nat.NatModelOutput with Nat->Dinat +class DinatModelOutput(ModelOutput): + """ + Dinat model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed): + Average pooling of the last layer hidden-state. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: torch.FloatTensor = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.nat.modeling_nat.NatImageClassifierOutput with Nat->Dinat +class DinatImageClassifierOutput(ModelOutput): + """ + Dinat outputs for image classification. 
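To make the output fields described above concrete, here is a small sketch of the shapes a default `DinatConfig` yields (randomly initialized weights; assumes the `natten` kernels are installed):

```python
import torch
from transformers import DinatConfig, DinatModel

model = DinatModel(DinatConfig())  # depths [3, 4, 6, 5], embed_dim 64, so the final channel dim is 512
model.eval()

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)

print(outputs.last_hidden_state.shape)          # torch.Size([1, 7, 7, 512])
print(outputs.pooler_output.shape)              # torch.Size([1, 512])
print(outputs.reshaped_hidden_states[0].shape)  # torch.Size([1, 64, 56, 56]), channels-first view
```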
+ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +# Copied from transformers.models.nat.modeling_nat.NatEmbeddings with Nat->Dinat +class DinatEmbeddings(nn.Module): + """ + Construct the patch and position embeddings. + """ + + def __init__(self, config): + super().__init__() + + self.patch_embeddings = DinatPatchEmbeddings(config) + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor]: + embeddings = self.patch_embeddings(pixel_values) + embeddings = self.norm(embeddings) + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Copied from transformers.models.nat.modeling_nat.NatPatchEmbeddings with Nat->Dinat +class DinatPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + patch_size = config.patch_size + num_channels, hidden_size = config.num_channels, config.embed_dim + self.num_channels = num_channels + + if patch_size == 4: + pass + else: + # TODO: Support arbitrary patch sizes. 
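+ # The projection below downsamples with two stride-2 convolutions, so the resulting patch resolution is fixed at 4.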
+ raise ValueError("Dinat only supports patch size of 4 at the moment.") + + self.projection = nn.Sequential( + nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), + nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), + ) + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor: + _, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + embeddings = self.projection(pixel_values) + embeddings = embeddings.permute(0, 2, 3, 1) + + return embeddings + + +# Copied from transformers.models.nat.modeling_nat.NatDownsampler with Nat->Dinat +class DinatDownsampler(nn.Module): + """ + Convolutional Downsampling Layer. + + Args: + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. + """ + + def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: + super().__init__() + self.dim = dim + self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) + self.norm = norm_layer(2 * dim) + + def forward(self, input_feature: torch.Tensor) -> torch.Tensor: + input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + input_feature = self.norm(input_feature) + return input_feature + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input, drop_prob=0.0, training=False, scale_by_keep=True): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
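Put differently, each sample's residual branch is dropped with probability `drop_prob` during training and the survivors are rescaled by `1 / keep_prob`, so the layer matches the identity path in expectation. A toy sketch of that behaviour (illustrative tensor, not part of this patch):

```python
import torch

drop_prob, keep_prob = 0.2, 0.8
x = torch.ones(4, 3)  # toy "residual branch" output for a batch of 4 samples

mask = (keep_prob + torch.rand(4, 1)).floor()  # per sample: 1 with probability keep_prob, else 0
out = x / keep_prob * mask

print(out)         # surviving rows equal 1.25, dropped rows are all zeros
print(out.mean())  # 1.0 in expectation (averaged over many draws)
```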
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Dinat +class DinatDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class NeighborhoodAttention(nn.Module): + def __init__(self, config, dim, num_heads, kernel_size, dilation): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.kernel_size = kernel_size + self.dilation = dilation + + # rpb is learnable relative positional biases; same concept is used Swin. + self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1))) + + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + # Copied from transformers.models.nat.modeling_nat.NeighborhoodAttention.transpose_for_scores with Nat->Dinat + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 3, 1, 2, 4) + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + + query_layer = self.transpose_for_scores(self.query(hidden_states)) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # Apply the scale factor before computing attention weights. It's usually more efficient because + # attention weights are typically a bigger tensor compared to query. + # It gives identical results because scalars are commutable in matrix multiplication. + query_layer = query_layer / math.sqrt(self.attention_head_size) + + # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases. + attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.dilation) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + context_layer = natten2dav(attention_probs, value_layer, self.dilation) + context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionOutput +class NeighborhoodAttentionOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class NeighborhoodAttentionModule(nn.Module): + def __init__(self, config, dim, num_heads, kernel_size, dilation): + super().__init__() + self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size, dilation) + self.output = NeighborhoodAttentionOutput(config, dim) + self.pruned_heads = set() + + # Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionModule.prune_heads + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + # Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionModule.forward + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self(hidden_states, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.nat.modeling_nat.NatIntermediate with Nat->Dinat +class DinatIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.nat.modeling_nat.NatOutput with Nat->Dinat +class DinatOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = 
self.dropout(hidden_states) + return hidden_states + + +class DinatLayer(nn.Module): + def __init__(self, config, dim, num_heads, dilation, drop_path_rate=0.0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.kernel_size = config.kernel_size + self.dilation = dilation + self.window_size = self.kernel_size * self.dilation + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = NeighborhoodAttentionModule( + config, dim, num_heads, kernel_size=self.kernel_size, dilation=self.dilation + ) + self.drop_path = DinatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = DinatIntermediate(config, dim) + self.output = DinatOutput(config, dim) + + def maybe_pad(self, hidden_states, height, width): + window_size = self.window_size + pad_values = (0, 0, 0, 0, 0, 0) + if height < window_size or width < window_size: + pad_l = pad_t = 0 + pad_r = max(0, window_size - width) + pad_b = max(0, window_size - height) + pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size, height, width, channels = hidden_states.size() + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states) + # pad hidden_states if they are smaller than kernel size x dilation + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + + attention_outputs = self.attention(hidden_states, output_attentions=output_attentions) + + attention_output = attention_outputs[0] + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_output = attention_output[:, :height, :width, :].contiguous() + + hidden_states = shortcut + self.drop_path(attention_output) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.output(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +class DinatStage(nn.Module): + def __init__(self, config, dim, depth, num_heads, dilations, drop_path_rate, downsample): + super().__init__() + self.config = config + self.dim = dim + self.layers = nn.ModuleList( + [ + DinatLayer( + config=config, + dim=dim, + num_heads=num_heads, + dilation=dilations[i], + drop_path_rate=drop_path_rate[i], + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm) + else: + self.downsample = None + + self.pointing = False + + # Copied from transformers.models.nat.modeling_nat.NatStage.forward + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + _, height, width, _ = hidden_states.size() + for i, layer_module in enumerate(self.layers): + layer_outputs = layer_module(hidden_states, output_attentions) + hidden_states = layer_outputs[0] + + if self.downsample is not None: + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, width_downsampled) + hidden_states = 
self.downsample(layer_outputs[0]) + else: + output_dimensions = (height, width, height, width) + + stage_outputs = (hidden_states, output_dimensions) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs + + +class DinatEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.num_levels = len(config.depths) + self.config = config + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + self.levels = nn.ModuleList( + [ + DinatStage( + config=config, + dim=int(config.embed_dim * 2**i_layer), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + dilations=config.dilations[i_layer], + drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=DinatDownsampler if (i_layer < self.num_levels - 1) else None, + ) + for i_layer in range(self.num_levels) + ] + ) + + # Copied from transformers.models.nat.modeling_nat.NatEncoder.forward with Nat->Dinat + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, DinatEncoderOutput]: + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + # rearrange b h w c -> b c h w + reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.levels): + layer_outputs = layer_module(hidden_states, output_attentions) + + hidden_states = layer_outputs[0] + + if output_hidden_states: + # rearrange b h w c -> b c h w + reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[2:] + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return DinatEncoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + reshaped_hidden_states=all_reshaped_hidden_states, + ) + + +class DinatPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DinatConfig + base_model_prefix = "dinat" + main_input_name = "pixel_values" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module: DinatEncoder, value: bool = False) -> None: + pass + + +DINAT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. 
+ + Parameters: + config ([`DinatConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DINAT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`AutoImageProcessor.__call__`] for details. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Dinat Model transformer outputting raw hidden-states without any specific head on top.", + DINAT_START_DOCSTRING, +) +# Copied from transformers.models.nat.modeling_nat.NatModel with Nat->Dinat, NAT->DINAT +class DinatModel(DinatPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + + requires_backends(self, ["natten"]) + + self.config = config + self.num_levels = len(config.depths) + self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1)) + + self.embeddings = DinatEmbeddings(config) + self.encoder = DinatEncoder(config) + + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=DinatModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, DinatModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2)) + pooled_output = torch.flatten(pooled_output, 1) + + if not return_dict: + output = (sequence_output, pooled_output) + encoder_outputs[1:] + + return output + + return DinatModelOutput( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, + ) + + +@add_start_docstrings( + """ + Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state + of the [CLS] token) e.g. for ImageNet. + """, + DINAT_START_DOCSTRING, +) +class DinatForImageClassification(DinatPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + requires_backends(self, ["natten"]) + + self.num_labels = config.num_labels + self.dinat = DinatModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(self.dinat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=DinatImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, DinatImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.dinat( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DinatImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + reshaped_hidden_states=outputs.reshaped_hidden_states, + ) diff --git a/src/transformers/models/nat/__init__.py b/src/transformers/models/nat/__init__.py new file mode 100644 index 00000000000..c5cc25af9f8 --- /dev/null +++ b/src/transformers/models/nat/__init__.py @@ -0,0 +1,60 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
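The classification head above follows the library's standard `problem_type` convention, so integer `labels` produce a cross-entropy loss. A short sketch with a randomly initialized model (again assuming `natten` is installed; the label values are arbitrary):

```python
import torch
from transformers import DinatConfig, DinatForImageClassification

config = DinatConfig(num_labels=10)
model = DinatForImageClassification(config)  # random weights, for illustration only

pixel_values = torch.randn(2, 3, 224, 224)
labels = torch.tensor([1, 7])  # long dtype, so problem_type resolves to single_label_classification

outputs = model(pixel_values, labels=labels)
print(outputs.loss)          # scalar cross-entropy loss
print(outputs.logits.shape)  # torch.Size([2, 10])
```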
+from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = {"configuration_nat": ["NAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "NatConfig"]} + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_nat"] = [ + "NAT_PRETRAINED_MODEL_ARCHIVE_LIST", + "NatForImageClassification", + "NatModel", + "NatPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_nat import NAT_PRETRAINED_CONFIG_ARCHIVE_MAP, NatConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_nat import ( + NAT_PRETRAINED_MODEL_ARCHIVE_LIST, + NatForImageClassification, + NatModel, + NatPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/nat/configuration_nat.py b/src/transformers/models/nat/configuration_nat.py new file mode 100644 index 00000000000..cde88bd160d --- /dev/null +++ b/src/transformers/models/nat/configuration_nat.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Neighborhood Attention Transformer model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +NAT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "shi-labs/nat-mini-in1k-224": "https://huggingface.co/shi-labs/nat-mini-in1k-224/resolve/main/config.json", + # See all Nat models at https://huggingface.co/models?filter=nat +} + + +class NatConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Nat + [shi-labs/nat-mini-in1k-224](https://huggingface.co/shi-labs/nat-mini-in1k-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + patch_size (`int`, *optional*, defaults to 4): + The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embed_dim (`int`, *optional*, defaults to 64): + Dimensionality of patch embedding. + depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 5]`): + Number of layers in each level of the encoder. + num_heads (`List[int]`, *optional*, defaults to `[2, 4, 8, 16]`): + Number of attention heads in each layer of the Transformer encoder. 
+ kernel_size (`int`, *optional*, defaults to 7): + Neighborhood Attention kernel size. + mlp_ratio (`float`, *optional*, defaults to 3.0): + Ratio of MLP hidden dimensionality to embedding dimensionality. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether or not a learnable bias should be added to the queries, keys and values. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + drop_path_rate (`float`, *optional*, defaults to 0.1): + Stochastic depth rate. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + patch_norm (`bool`, *optional*, defaults to `True`): + Whether or not to add layer normalization after patch embedding. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + + Example: + + ```python + >>> from transformers import NatConfig, NatModel + + >>> # Initializing a Nat shi-labs/nat-mini-in1k-224 style configuration + >>> configuration = NatConfig() + + >>> # Initializing a model (with random weights) from the shi-labs/nat-mini-in1k-224 style configuration + >>> model = NatModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "nat" + + attribute_map = { + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + } + + def __init__( + self, + patch_size=4, + num_channels=3, + embed_dim=64, + depths=[3, 4, 6, 5], + num_heads=[2, 4, 8, 16], + kernel_size=7, + mlp_ratio=3.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + **kwargs + ): + super().__init__(**kwargs) + + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_layers = len(depths) + self.num_heads = num_heads + self.kernel_size = kernel_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.path_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + # we set the hidden_size attribute in order to make Nat work with VisionEncoderDecoderModel + # this indicates the channel dimension after the last stage of the model + self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py new file mode 100644 index 00000000000..f5216f55e14 --- /dev/null +++ b/src/transformers/models/nat/modeling_nat.py @@ -0,0 +1,839 @@ +# coding=utf-8 +# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
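As the comment at the end of the configuration above notes, some attributes are derived rather than passed in. A quick sketch of the defaults (no `natten` needed, since no model is instantiated):

```python
from transformers import NatConfig

config = NatConfig()  # embed_dim=64, depths=[3, 4, 6, 5]
print(config.num_layers)           # 4, i.e. len(depths)
print(config.hidden_size)          # 512, i.e. embed_dim * 2 ** (len(depths) - 1)
print(config.num_attention_heads)  # [2, 4, 8, 16], via the attribute_map alias
```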
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Neighborhood Attention Transformer model.""" + + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + OptionalDependencyNotAvailable, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_natten_available, + logging, + requires_backends, +) +from .configuration_nat import NatConfig + + +if is_natten_available(): + from natten.functional import natten2dav, natten2dqkrpb +else: + + def natten2dqkrpb(*args, **kwargs): + raise OptionalDependencyNotAvailable() + + def natten2dav(*args, **kwargs): + raise OptionalDependencyNotAvailable() + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "NatConfig" +_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "shi-labs/nat-mini-in1k-224" +_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "shi-labs/nat-mini-in1k-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat" + + +NAT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "shi-labs/nat-mini-in1k-224", + # See all Nat models at https://huggingface.co/models?filter=nat +] + +# drop_path and NatDropPath are from the timm library. + + +@dataclass +class NatEncoderOutput(ModelOutput): + """ + Nat encoder's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class NatModelOutput(ModelOutput): + """ + Nat model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed): + Average pooling of the last layer hidden-state. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: torch.FloatTensor = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class NatImageClassifierOutput(ModelOutput): + """ + Nat outputs for image classification. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +class NatEmbeddings(nn.Module): + """ + Construct the patch and position embeddings. + """ + + def __init__(self, config): + super().__init__() + + self.patch_embeddings = NatPatchEmbeddings(config) + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor]: + embeddings = self.patch_embeddings(pixel_values) + embeddings = self.norm(embeddings) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class NatPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + patch_size = config.patch_size + num_channels, hidden_size = config.num_channels, config.embed_dim + self.num_channels = num_channels + + if patch_size == 4: + pass + else: + # TODO: Support arbitrary patch sizes. + raise ValueError("Dinat only supports patch size of 4 at the moment.") + + self.projection = nn.Sequential( + nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), + nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), + ) + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor: + _, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + embeddings = self.projection(pixel_values) + embeddings = embeddings.permute(0, 2, 3, 1) + + return embeddings + + +class NatDownsampler(nn.Module): + """ + Convolutional Downsampling Layer. + + Args: + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. 
+ """ + + def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: + super().__init__() + self.dim = dim + self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) + self.norm = norm_layer(2 * dim) + + def forward(self, input_feature: torch.Tensor) -> torch.Tensor: + input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + input_feature = self.norm(input_feature) + return input_feature + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input, drop_prob=0.0, training=False, scale_by_keep=True): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Nat +class NatDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class NeighborhoodAttention(nn.Module): + def __init__(self, config, dim, num_heads, kernel_size): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.kernel_size = kernel_size + + # rpb is learnable relative positional biases; same concept is used Swin. 
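+ # rpb has shape (num_heads, 2 * kernel_size - 1, 2 * kernel_size - 1): one bias per head for every possible + # relative offset between a query pixel and a key pixel within its neighborhood.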
+        self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
+
+        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 3, 1, 2, 4)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor]:
+
+        query_layer = self.transpose_for_scores(self.query(hidden_states))
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        # Apply the scale factor before computing attention weights. This is usually more efficient because the
+        # attention weights are typically a bigger tensor than the query. It gives identical results because
+        # scalar multiplication commutes with matrix multiplication.
+        query_layer = query_layer / math.sqrt(self.attention_head_size)
+
+        # Compute NA between "query" and "key" to get the raw attention scores, and add the relative positional
+        # biases. The last argument is the dilation, which is fixed to 1 for NAT.
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, 1)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = natten2dav(attention_probs, value_layer, 1)
+        context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class NeighborhoodAttentionOutput(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, dim)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+class NeighborhoodAttentionModule(nn.Module):
+    def __init__(self, config, dim, num_heads, kernel_size):
+        super().__init__()
+        self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size)
+        self.output = NeighborhoodAttentionOutput(config, dim)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * 
self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self(hidden_states, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class NatIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class NatOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class NatLayer(nn.Module): + def __init__(self, config, dim, num_heads, drop_path_rate=0.0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.kernel_size = config.kernel_size + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = NeighborhoodAttentionModule(config, dim, num_heads, kernel_size=self.kernel_size) + self.drop_path = NatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = NatIntermediate(config, dim) + self.output = NatOutput(config, dim) + + def maybe_pad(self, hidden_states, height, width): + window_size = self.kernel_size + pad_values = (0, 0, 0, 0, 0, 0) + if height < window_size or width < window_size: + pad_l = pad_t = 0 + pad_r = max(0, window_size - width) + pad_b = max(0, window_size - height) + pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size, height, width, channels = hidden_states.size() + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states) + # pad hidden_states if they are smaller than kernel size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + + attention_outputs = self.attention(hidden_states, output_attentions=output_attentions) + + attention_output = attention_outputs[0] + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_output = attention_output[:, :height, :width, :].contiguous() + + hidden_states = shortcut + self.drop_path(attention_output) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.output(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +class NatStage(nn.Module): + def __init__(self, config, 
dim, depth, num_heads, drop_path_rate, downsample): + super().__init__() + self.config = config + self.dim = dim + self.layers = nn.ModuleList( + [ + NatLayer( + config=config, + dim=dim, + num_heads=num_heads, + drop_path_rate=drop_path_rate[i], + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm) + else: + self.downsample = None + + self.pointing = False + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + _, height, width, _ = hidden_states.size() + for i, layer_module in enumerate(self.layers): + layer_outputs = layer_module(hidden_states, output_attentions) + hidden_states = layer_outputs[0] + + if self.downsample is not None: + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, width_downsampled) + hidden_states = self.downsample(layer_outputs[0]) + else: + output_dimensions = (height, width, height, width) + + stage_outputs = (hidden_states, output_dimensions) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs + + +class NatEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.num_levels = len(config.depths) + self.config = config + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + self.levels = nn.ModuleList( + [ + NatStage( + config=config, + dim=int(config.embed_dim * 2**i_layer), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=NatDownsampler if (i_layer < self.num_levels - 1) else None, + ) + for i_layer in range(self.num_levels) + ] + ) + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, NatEncoderOutput]: + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + # rearrange b h w c -> b c h w + reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.levels): + layer_outputs = layer_module(hidden_states, output_attentions) + + hidden_states = layer_outputs[0] + + if output_hidden_states: + # rearrange b h w c -> b c h w + reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[2:] + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return NatEncoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + reshaped_hidden_states=all_reshaped_hidden_states, + ) + + +class NatPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
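+    Nat relies on the `natten` package for its attention kernels; `NatModel` and `NatForImageClassification` check
+    that it is installed via `requires_backends(self, ["natten"])` when they are instantiated.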
+ """ + + config_class = NatConfig + base_model_prefix = "nat" + main_input_name = "pixel_values" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module: NatEncoder, value: bool = False) -> None: + pass + + +NAT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`NatConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +NAT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`AutoImageProcessor.__call__`] for details. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Nat Model transformer outputting raw hidden-states without any specific head on top.", + NAT_START_DOCSTRING, +) +class NatModel(NatPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + + requires_backends(self, ["natten"]) + + self.config = config + self.num_levels = len(config.depths) + self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1)) + + self.embeddings = NatEmbeddings(config) + self.encoder = NatEncoder(config) + + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=NatModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, NatModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2)) + pooled_output = torch.flatten(pooled_output, 1) + + if not return_dict: + output = (sequence_output, pooled_output) + encoder_outputs[1:] + + return output + + return NatModelOutput( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, + ) + + +@add_start_docstrings( + """ + Nat Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + """, + NAT_START_DOCSTRING, +) +class NatForImageClassification(NatPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + requires_backends(self, ["natten"]) + + self.num_labels = config.num_labels + self.nat = NatModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(self.nat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=NatImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, NatImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.nat( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return NatImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + reshaped_hidden_states=outputs.reshaped_hidden_states, + ) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index eba7062840f..76e866d81ae 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -58,6 +58,7 @@ from .utils import ( is_ipex_available, is_jumanpp_available, is_librosa_available, + is_natten_available, is_onnx_available, is_pandas_available, is_phonemizer_available, @@ -282,6 +283,16 @@ def require_timm(test_case): return unittest.skipUnless(is_timm_available(), "test requires Timm")(test_case) +def require_natten(test_case): + """ + Decorator marking a test that requires NATTEN. + + These tests are skipped when NATTEN isn't installed. + + """ + return unittest.skipUnless(is_natten_available(), "test requires natten")(test_case) + + def require_torch(test_case): """ Decorator marking a test that requires PyTorch. 
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 8e2d62a04cd..15154e2ad6b 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -113,6 +113,7 @@ from .import_utils import ( is_kenlm_available, is_librosa_available, is_more_itertools_available, + is_natten_available, is_ninja_available, is_onnx_available, is_pandas_available, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c55c7d6995a..14e65b68675 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1740,6 +1740,30 @@ class DeiTPreTrainedModel(metaclass=DummyObject): requires_backends(self, ["torch"]) +DINAT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class DinatForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DinatModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DinatPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -3738,6 +3762,30 @@ class MvpPreTrainedModel(metaclass=DummyObject): requires_backends(self, ["torch"]) +NAT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class NatForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class NatModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class NatPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 474b204170f..81faa6ea853 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -218,6 +218,14 @@ except importlib_metadata.PackageNotFoundError: _timm_available = False +_natten_available = importlib.util.find_spec("natten") is not None +try: + _natten_version = importlib_metadata.version("natten") + logger.debug(f"Successfully imported natten version {_natten_version}") +except importlib_metadata.PackageNotFoundError: + _natten_available = False + + _torchaudio_available = importlib.util.find_spec("torchaudio") is not None try: _torchaudio_version = importlib_metadata.version("torchaudio") @@ -644,6 +652,10 @@ def is_timm_available(): return _timm_available +def is_natten_available(): + return _natten_available + + def is_torchaudio_available(): return _torchaudio_available @@ -882,6 +894,13 @@ TIMM_IMPORT_ERROR = """ `pip install timm`. Please note that you may need to restart your runtime after installation. """ +# docstyle-ignore +NATTEN_IMPORT_ERROR = """ +{0} requires the natten library but it was not found in your environment. You can install it by referring to: +shi-labs.com/natten . You can also install it with pip (may take longer to build): +`pip install natten`. Please note that you may need to restart your runtime after installation. +""" + # docstyle-ignore VISION_IMPORT_ERROR = """ {0} requires the PIL library but it was not found in your environment. 
You can install it with pip: @@ -936,6 +955,7 @@ BACKENDS_MAPPING = OrderedDict( ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), ("tensorflow_text", (is_tensorflow_text_available, TENSORFLOW_TEXT_IMPORT_ERROR)), ("timm", (is_timm_available, TIMM_IMPORT_ERROR)), + ("natten", (is_natten_available, NATTEN_IMPORT_ERROR)), ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)), ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), ("vision", (is_vision_available, VISION_IMPORT_ERROR)), diff --git a/tests/models/dinat/__init__.py b/tests/models/dinat/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py new file mode 100644 index 00000000000..ed0356f6ad6 --- /dev/null +++ b/tests/models/dinat/test_modeling_dinat.py @@ -0,0 +1,336 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Dinat model. """ + +import collections +import inspect +import unittest + +from transformers import DinatConfig +from transformers.testing_utils import require_natten, require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DinatForImageClassification, DinatModel + from transformers.models.dinat.modeling_dinat import DINAT_PRETRAINED_MODEL_ARCHIVE_LIST + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class DinatModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + patch_size=4, + num_channels=3, + embed_dim=16, + depths=[1, 2, 1], + num_heads=[2, 4, 8], + kernel_size=3, + dilations=[[3], [1, 2], [1]], + mlp_ratio=2.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + is_training=True, + scope=None, + use_labels=True, + type_sequence_label_size=10, + encoder_stride=8, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_heads = num_heads + self.kernel_size = kernel_size + self.dilations = dilations + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.patch_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.is_training = 
is_training + self.scope = scope + self.use_labels = use_labels + self.type_sequence_label_size = type_sequence_label_size + self.encoder_stride = encoder_stride + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DinatConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + embed_dim=self.embed_dim, + depths=self.depths, + num_heads=self.num_heads, + kernel_size=self.kernel_size, + dilations=self.dilations, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + drop_path_rate=self.drop_path_rate, + hidden_act=self.hidden_act, + patch_norm=self.patch_norm, + layer_norm_eps=self.layer_norm_eps, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DinatModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + expected_height = expected_width = (config.image_size // config.patch_size) // (2 ** (len(config.depths) - 1)) + expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1)) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, expected_height, expected_width, expected_dim) + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = DinatForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = DinatForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_natten +@require_torch +class DinatModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = (DinatModel, DinatForImageClassification) if is_torch_available() else () + fx_compatible = False + + test_torchscript = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = DinatModelTester(self) + self.config_tester = ConfigTester(self, config_class=DinatConfig, embed_dim=37) + + def test_config(self): + self.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + def 
create_and_test_config_common_properties(self): + return + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + def test_inputs_embeds(self): + # Dinat does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_attention_outputs(self): + self.skipTest("Dinat's attention operation is handled entirely by NATTEN.") + + def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # Dinat has a different seq_length + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + + height = image_size[0] // patch_size[0] + width = image_size[1] // patch_size[1] + + self.assertListEqual( + list(hidden_states[0].shape[-3:]), + [height, width, self.model_tester.embed_dim], + ) + + reshaped_hidden_states = outputs.reshaped_hidden_states + self.assertEqual(len(reshaped_hidden_states), expected_num_layers) + + batch_size, num_channels, height, width = reshaped_hidden_states[0].shape + reshaped_hidden_states = ( + reshaped_hidden_states[0].view(batch_size, num_channels, height, width).permute(0, 2, 3, 1) + ) + self.assertListEqual( + list(reshaped_hidden_states.shape[-3:]), + [height, width, self.model_tester.embed_dim], + ) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + @slow + def test_model_from_pretrained(self): + for model_name in 
DINAT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DinatModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if "embeddings" not in name and param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +@require_natten +@require_vision +@require_torch +class DinatModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return AutoImageProcessor.from_pretrained("shi-labs/dinat-mini-in1k-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = DinatForImageClassification.from_pretrained("shi-labs/dinat-mini-in1k-224").to(torch_device) + feature_extractor = self.default_feature_extractor + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + expected_slice = torch.tensor([-0.1545, -0.7667, 0.4642]).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) diff --git a/tests/models/nat/__init__.py b/tests/models/nat/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/models/nat/test_modeling_nat.py b/tests/models/nat/test_modeling_nat.py new file mode 100644 index 00000000000..1c239c36f8b --- /dev/null +++ b/tests/models/nat/test_modeling_nat.py @@ -0,0 +1,333 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Nat model. 
""" + +import collections +import inspect +import unittest + +from transformers import NatConfig +from transformers.testing_utils import require_natten, require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import NatForImageClassification, NatModel + from transformers.models.nat.modeling_nat import NAT_PRETRAINED_MODEL_ARCHIVE_LIST + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class NatModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + patch_size=4, + num_channels=3, + embed_dim=16, + depths=[1, 2, 1], + num_heads=[2, 4, 8], + kernel_size=3, + mlp_ratio=2.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + is_training=True, + scope=None, + use_labels=True, + type_sequence_label_size=10, + encoder_stride=8, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_heads = num_heads + self.kernel_size = kernel_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.patch_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.is_training = is_training + self.scope = scope + self.use_labels = use_labels + self.type_sequence_label_size = type_sequence_label_size + self.encoder_stride = encoder_stride + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return NatConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + embed_dim=self.embed_dim, + depths=self.depths, + num_heads=self.num_heads, + kernel_size=self.kernel_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + drop_path_rate=self.drop_path_rate, + hidden_act=self.hidden_act, + patch_norm=self.patch_norm, + layer_norm_eps=self.layer_norm_eps, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = NatModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + expected_height = expected_width = (config.image_size // config.patch_size) // (2 ** (len(config.depths) - 1)) + expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1)) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, expected_height, expected_width, expected_dim) + ) + 
+ def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = NatForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = NatForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_natten +@require_torch +class NatModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = (NatModel, NatForImageClassification) if is_torch_available() else () + fx_compatible = False + + test_torchscript = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = NatModelTester(self) + self.config_tester = ConfigTester(self, config_class=NatConfig, embed_dim=37) + + def test_config(self): + self.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + def create_and_test_config_common_properties(self): + return + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + def test_inputs_embeds(self): + # Nat does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_attention_outputs(self): + self.skipTest("Nat's attention operation is handled entirely by NATTEN.") + + def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + 
hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # Nat has a different seq_length + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + + height = image_size[0] // patch_size[0] + width = image_size[1] // patch_size[1] + + self.assertListEqual( + list(hidden_states[0].shape[-3:]), + [height, width, self.model_tester.embed_dim], + ) + + reshaped_hidden_states = outputs.reshaped_hidden_states + self.assertEqual(len(reshaped_hidden_states), expected_num_layers) + + batch_size, num_channels, height, width = reshaped_hidden_states[0].shape + reshaped_hidden_states = ( + reshaped_hidden_states[0].view(batch_size, num_channels, height, width).permute(0, 2, 3, 1) + ) + self.assertListEqual( + list(reshaped_hidden_states.shape[-3:]), + [height, width, self.model_tester.embed_dim], + ) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + @slow + def test_model_from_pretrained(self): + for model_name in NAT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = NatModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if "embeddings" not in name and param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +@require_natten +@require_vision +@require_torch +class NatModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224").to(torch_device) + feature_extractor = self.default_feature_extractor + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + expected_slice = torch.tensor([0.3805, -0.8676, -0.3912]).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) diff --git a/utils/documentation_tests.txt 
b/utils/documentation_tests.txt index 4bcfea5d71e..b7b63e3602b 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -66,6 +66,8 @@ src/transformers/models/deit/modeling_deit.py src/transformers/models/deit/modeling_tf_deit.py src/transformers/models/detr/configuration_detr.py src/transformers/models/detr/modeling_detr.py +src/transformers/models/dinat/configuration_dinat.py +src/transformers/models/dinat/modeling_dinat.py src/transformers/models/distilbert/configuration_distilbert.py src/transformers/models/dpr/configuration_dpr.py src/transformers/models/dpt/modeling_dpt.py @@ -113,6 +115,8 @@ src/transformers/models/mobilebert/modeling_tf_mobilebert.py src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py src/transformers/models/mobilevit/modeling_mobilevit.py src/transformers/models/mobilevit/modeling_tf_mobilevit.py +src/transformers/models/nat/configuration_nat.py +src/transformers/models/nat/modeling_nat.py src/transformers/models/nezha/configuration_nezha.py src/transformers/models/openai/configuration_openai.py src/transformers/models/opt/configuration_opt.py