Adding SegGPT (#27735)

* First commit

* Improvements

* More improvements

* Converted original checkpoint to HF checkpoint

* Fix style

* Fixed forward

* More improvements

* More improvements

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Remove asserts

* Remove unnecessary attributes

* Changed model name to camel case

* Improve forward doc

* Improve tests

* More improvements

* Fix copies

* Fix doc

* Make SegGptImageProcessor more flexible

* Added few-shot test

* Fix style

* Update READMEs and docs

* Update READMEs

* Make inputs required

* Add SegGptForImageSegmentation

* Make tests pass

* Rename to out_indicies

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Fixed naming convention

* Copying SegGptMlp from modeling_sam.py

* Some minor improvements

* Remove mlp_ratio

* Fix docstrings

* Fixed docstring match

* Objects defined before use

* Storing only patch_size and beta for SegGptLoss

* removed _prepare_inputs method

* Removed modified from headers

* Renamed to output_indicies

* Removed unnecessary einsums

* Update tests/models/seggpt/test_modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/seggpt/test_modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/seggpt/test_modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Fixing issues

* Raise error as soon as possible

* More fixes

* Fix merge

* Added palette to SegGptImageProcessor

* Fixed typo

* Fixed shape typo

* Added permute before doing palette to class mapping

* Fixed style

* Fixed and added tests

* Fixed docstrings

* Matching SegFormer API for post_processing_semantic_segmentation

* Fixed copies

* Fixed SegGptImageProcessor to handle both binary and RGB masks

* Updated docstrings of SegGptImageProcessor

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update docs/source/en/model_doc/seggpt.md

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/configuration_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/convert_seggpt_to_hf.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/seggpt/test_image_processing_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/seggpt/test_modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Object definitions above & fix style

* Renamed output_indices to intermediate_feature_indices

* Removed unnecessary check on bool_masked_pos

* Loss first in the outputs

* Added validation for do_normalize

* Improved SegGptImageProcessor and added new tests

* Added comment

* Added docstrings to SegGptLoss

* Reimplemented ensemble condition logic in SegGptEncoder

* Update src/transformers/models/seggpt/__init__.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/seggpt/convert_seggpt_to_hf.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/seggpt/configuration_seggpt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Updated docstrings to use post_process_semantic_segmentation

* Fixed typo on docstrings

* moved pixel values test to test_image_processing_seggpt

* Addressed comments

* Update src/transformers/models/seggpt/configuration_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/image_processing_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/configuration_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Updated docstrings for SegGptLoss

* Address comments

* Added SegGpt example to model docs

* Update src/transformers/models/seggpt/modeling_seggpt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* moved patchify and unpatchify

* Rename checkpoint

* Renamed intermediate_features to intermediate_hidden_states for consistency

* Update src/transformers/models/seggpt/configuration_seggpt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Replaced post_process_masks for post_process_semantic_segmentation in the docs

---------

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Niels <niels.rogge1@gmail.com>
Co-authored-by: Eduardo Pacheco <eduardo.pacheco@limehome.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Eduardo Pacheco 2024-02-26 19:17:19 +01:00 committed by GitHub
parent 3b8c053631
commit 3fcfbe7549
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
28 changed files with 2816 additions and 4 deletions

View File

@ -482,6 +482,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SegGPT](https://huggingface.co/docs/transformers/main/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.

View File

@ -455,6 +455,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SegGPT](https://huggingface.co/docs/transformers/main/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.

View File

@ -476,6 +476,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (de Meta AI) a été publié dans l'article [SeamlessM4T — Traduction multimodale et massivement multilingue](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) par l'équipe de communication transparente.
1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (de Meta AI) a été publié dans l'article [Seamless: Traduction de la parole multilingue, expressive et en continu](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) par l'équipe de communication transparente.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (de NVIDIA) a été publié dans l'article [SegFormer : Conception simple et efficace pour la segmentation sémantique avec des transformateurs](https://arxiv.org/abs/2105.15203) par Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SegGPT](https://huggingface.co/docs/transformers/main/model_doc/seggpt)** (de Beijing Academy of Artificial Intelligence (BAAI) publié dans l'article [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) parXinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (de Meta AI) a été publié dans l'article [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) par Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (de ASAPP) a été publié dans l'article [Compromis entre performances et efficacité dans l'entraînement non supervisé pour la reconnaissance vocale](https://arxiv.org/abs/2109.06870) par Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (de ASAPP) a été publié dans l'article [Compromis entre performances et efficacité dans l'entraînement non supervisé pour la reconnaissance vocale](https://arxiv.org/abs/2109.06870) par Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.

View File

@ -429,6 +429,7 @@ conda install conda-forge::transformers
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SegGPT](https://huggingface.co/docs/transformers/main/model_doc/seggpt)** (Beijing Academy of Artificial Intelligence (BAAI से) Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang. द्वाराअनुसंधान पत्र [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) के साथ जारी किया गया
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI से) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. द्वाराअनुसंधान पत्र [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) के साथ जारी किया गया
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP से) साथ देने वाला पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योव आर्टज़ी द्वारा।
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP से) साथ में पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योआव आर्टज़ी द्वारा पोस्ट किया गया।

View File

@ -489,6 +489,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203)
1. **[SegGPT](https://huggingface.co/docs/transformers/main/model_doc/seggpt)** (Beijing Academy of Artificial Intelligence (BAAI から) Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang. から公開された研究論文 [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284)
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI から) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. から公開された研究論文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)

View File

@ -404,6 +404,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다.
1. **[SegGPT](https://huggingface.co/docs/transformers/main/model_doc/seggpt)** (Beijing Academy of Artificial Intelligence (BAAI 에서 제공)은 Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.의 [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284)논문과 함께 발표했습니다.
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI 에서 제공)은 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.의 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)논문과 함께 발표했습니다.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다.
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다.

View File

@ -428,6 +428,7 @@ conda install conda-forge::transformers
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。
1. **[SegGPT](https://huggingface.co/docs/transformers/main/model_doc/seggpt)** (来自 Beijing Academy of Artificial Intelligence (BAAI) 伴随论文 [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) 由 Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang 发布。
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (来自 Meta AI) 伴随论文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 由 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick 发布。
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。

View File

@ -440,6 +440,7 @@ conda install conda-forge::transformers
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SegGPT](https://huggingface.co/docs/transformers/main/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.

View File

@ -583,6 +583,8 @@
title: ResNet
- local: model_doc/segformer
title: SegFormer
- local: model_doc/seggpt
title: SegGpt
- local: model_doc/swiftformer
title: SwiftFormer
- local: model_doc/swin

View File

@ -251,6 +251,7 @@ Flax), PyTorch, and/or TensorFlow.
| [SeamlessM4T](model_doc/seamless_m4t) | ✅ | ❌ | ❌ |
| [SeamlessM4Tv2](model_doc/seamless_m4t_v2) | ✅ | ❌ | ❌ |
| [SegFormer](model_doc/segformer) | ✅ | ✅ | ❌ |
| [SegGPT](model_doc/seggpt) | ✅ | ❌ | ❌ |
| [SEW](model_doc/sew) | ✅ | ❌ | ❌ |
| [SEW-D](model_doc/sew-d) | ✅ | ❌ | ❌ |
| [SigLIP](model_doc/siglip) | ✅ | ❌ | ❌ |

View File

@ -0,0 +1,90 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# SegGPT
## Overview
The SegGPT model was proposed in [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang. SegGPT employs a decoder-only Transformer that can generate a segmentation mask given an input image, a prompt image and its corresponding prompt mask. The model achieves remarkable one-shot results with 56.1 mIoU on COCO-20 and 85.6 mIoU on FSS-1000.
The abstract from the paper is the following:
*We present SegGPT, a generalist model for segmenting everything in context. We unify various segmentation tasks into a generalist in-context learning framework that accommodates different kinds of segmentation data by transforming them into the same format of images. The training of SegGPT is formulated as an in-context coloring problem with random color mapping for each data sample. The objective is to accomplish diverse tasks according to the context, rather than relying on specific colors. After training, SegGPT can perform arbitrary segmentation tasks in images or videos via in-context inference, such as object instance, stuff, part, contour, and text. SegGPT is evaluated on a broad range of tasks, including few-shot semantic segmentation, video object segmentation, semantic segmentation, and panoptic segmentation. Our results show strong capabilities in segmenting in-domain and out-of*
Tips:
- One can use [`SegGptImageProcessor`] to prepare image input, prompt and mask to the model.
- It's highly advisable to pass `num_labels` (not considering background) during preprocessing and postprocessing with [`SegGptImageProcessor`] for your use case.
- When doing infenrece with [`SegGptForImageSegmentation`] if your `batch_size` is greater than 1 you can use feature ensemble across your images by passing `feature_ensemble=True` in the forward method.
Here's how to use the model for one-shot semantic segmentation:
```python
import torch
from datasets import load_dataset
from transformers import SegGptImageProcessor, SegGptForImageSegmentation
model_id = "BAAI/seggpt-vit-large"
image_processor = SegGptImageProcessor.from_pretrained(checkpoint)
model = SegGptForImageSegmentation.from_pretrained(checkpoint)
dataset_id = "EduardoPacheco/FoodSeg103"
ds = load_dataset(dataset_id, split="train")
# Number of labels in FoodSeg103 (not including background)
num_labels = 103
image_input = ds[4]["image"]
ground_truth = ds[4]["label"]
image_prompt = ds[29]["image"]
mask_prompt = ds[29]["label"]
inputs = image_processor(
images=image_input,
prompt_images=image_prompt,
prompt_masks=mask_prompt,
num_labels=num_labels,
return_tensors="pt"
)
with torch.no_grad():
outputs = model(**inputs)
target_sizes = [image_input.size[::-1]]
mask = image_processor.post_process_semantic_segmentation(outputs, target_sizes, num_labels=num_labels)[0]
```
This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPacheco).
The original code can be found [here]([(https://github.com/baaivision/Painter/tree/main)).
## SegGptConfig
[[autodoc]] SegGptConfig
## SegGptImageProcessor
[[autodoc]] SegGptImageProcessor
- preprocess
- post_process_semantic_segmentation
## SegGptModel
[[autodoc]] SegGptModel
- forward
## SegGptForImageSegmentation
[[autodoc]] SegGptForImageSegmentation
- forward

View File

@ -767,6 +767,7 @@ _import_structure = {
"SeamlessM4Tv2Config",
],
"models.segformer": ["SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegformerConfig"],
"models.seggpt": ["SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegGptConfig"],
"models.sew": ["SEW_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWConfig"],
"models.sew_d": ["SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWDConfig"],
"models.siglip": [
@ -1316,6 +1317,7 @@ else:
_import_structure["models.pvt"].extend(["PvtImageProcessor"])
_import_structure["models.sam"].extend(["SamImageProcessor"])
_import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"])
_import_structure["models.seggpt"].extend(["SegGptImageProcessor"])
_import_structure["models.siglip"].append("SiglipImageProcessor")
_import_structure["models.swin2sr"].append("Swin2SRImageProcessor")
_import_structure["models.tvlt"].append("TvltImageProcessor")
@ -3192,6 +3194,14 @@ else:
"SegformerPreTrainedModel",
]
)
_import_structure["models.seggpt"].extend(
[
"SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"SegGptForImageSegmentation",
"SegGptModel",
"SegGptPreTrainedModel",
]
)
_import_structure["models.sew"].extend(
[
"SEW_PRETRAINED_MODEL_ARCHIVE_LIST",
@ -5531,10 +5541,8 @@ if TYPE_CHECKING:
SEAMLESS_M4T_V2_PRETRAINED_CONFIG_ARCHIVE_MAP,
SeamlessM4Tv2Config,
)
from .models.segformer import (
SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
SegformerConfig,
)
from .models.segformer import SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SegformerConfig
from .models.seggpt import SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, SegGptConfig
from .models.sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig
from .models.sew_d import SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWDConfig
from .models.siglip import (
@ -6080,6 +6088,7 @@ if TYPE_CHECKING:
from .models.pvt import PvtImageProcessor
from .models.sam import SamImageProcessor
from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor
from .models.seggpt import SegGptImageProcessor
from .models.siglip import SiglipImageProcessor
from .models.swin2sr import Swin2SRImageProcessor
from .models.tvlt import TvltImageProcessor
@ -7635,6 +7644,12 @@ if TYPE_CHECKING:
SegformerModel,
SegformerPreTrainedModel,
)
from .models.seggpt import (
SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST,
SegGptForImageSegmentation,
SegGptModel,
SegGptPreTrainedModel,
)
from .models.sew import (
SEW_PRETRAINED_MODEL_ARCHIVE_LIST,
SEWForCTC,

View File

@ -194,6 +194,7 @@ from . import (
seamless_m4t,
seamless_m4t_v2,
segformer,
seggpt,
sew,
sew_d,
siglip,

View File

@ -202,6 +202,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("seamless_m4t", "SeamlessM4TConfig"),
("seamless_m4t_v2", "SeamlessM4Tv2Config"),
("segformer", "SegformerConfig"),
("seggpt", "SegGptConfig"),
("sew", "SEWConfig"),
("sew-d", "SEWDConfig"),
("siglip", "SiglipConfig"),
@ -428,6 +429,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
("seamless_m4t", "SEAMLESS_M4T_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("seamless_m4t_v2", "SEAMLESS_M4T_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("segformer", "SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("seggpt", "SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("sew", "SEW_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("sew-d", "SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("siglip", "SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@ -680,6 +682,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("seamless_m4t", "SeamlessM4T"),
("seamless_m4t_v2", "SeamlessM4Tv2"),
("segformer", "SegFormer"),
("seggpt", "SegGPT"),
("sew", "SEW"),
("sew-d", "SEW-D"),
("siglip", "SigLIP"),

View File

@ -98,6 +98,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
("resnet", "ConvNextImageProcessor"),
("sam", "SamImageProcessor"),
("segformer", "SegformerImageProcessor"),
("seggpt", "SegGptImageProcessor"),
("siglip", "SiglipImageProcessor"),
("swiftformer", "ViTImageProcessor"),
("swin", "ViTImageProcessor"),

View File

@ -193,6 +193,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("seamless_m4t", "SeamlessM4TModel"),
("seamless_m4t_v2", "SeamlessM4Tv2Model"),
("segformer", "SegformerModel"),
("seggpt", "SegGptModel"),
("sew", "SEWModel"),
("sew-d", "SEWDModel"),
("siglip", "SiglipModel"),

View File

@ -0,0 +1,71 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_seggpt": ["SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegGptConfig", "SegGptOnnxConfig"]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_seggpt"] = [
"SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"SegGptModel",
"SegGptPreTrainedModel",
"SegGptForImageSegmentation",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_seggpt"] = ["SegGptImageProcessor"]
if TYPE_CHECKING:
from .configuration_seggpt import SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, SegGptConfig, SegGptOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_seggpt import (
SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST,
SegGptForImageSegmentation,
SegGptModel,
SegGptPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_seggpt import SegGptImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@ -0,0 +1,145 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" SegGpt model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"BAAI/seggpt-vit-large": "https://huggingface.co/BAAI/seggpt-vit-large/resolve/main/config.json",
}
class SegGptConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SegGptModel`]. It is used to instantiate a SegGPT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the SegGPT
[BAAI/seggpt-vit-large](https://huggingface.co/BAAI/seggpt-vit-large) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
image_size (`List[int]`, *optional*, defaults to `[896, 448]`):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the queries, keys and values.
mlp_dim (`int`, *optional*):
The dimensionality of the MLP layer in the Transformer encoder. If unset, defaults to
`hidden_size` * 4.
drop_path_rate (`float`, *optional*, defaults to 0.1):
The drop path rate for the dropout layers.
pretrain_image_size (`int`, *optional*, defaults to 224):
The pretrained size of the absolute position embeddings.
decoder_hidden_size (`int`, *optional*, defaults to 64):
Hidden size for decoder.
use_relative_position_embeddings (`bool`, *optional*, defaults to `True`):
Whether to use relative position embeddings in the attention layers.
merge_index (`int`, *optional*, defaults to 2):
The index of the encoder layer to merge the embeddings.
intermediate_hidden_state_indices (`List[int]`, *optional*, defaults to `[5, 11, 17, 23]`):
The indices of the encoder layers which we store as features for the decoder.
beta (`float`, *optional*, defaults to 0.01):
Regularization factor for SegGptLoss (smooth-l1 loss).
Example:
```python
>>> from transformers import SegGptConfig, SegGptModel
>>> # Initializing a SegGPT seggpt-vit-large style configuration
>>> configuration = SegGptConfig()
>>> # Initializing a model (with random weights) from the seggpt-vit-large style configuration
>>> model = SegGptModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "seggpt"
def __init__(
self,
hidden_size=1024,
num_hidden_layers=24,
num_attention_heads=16,
hidden_act="gelu",
hidden_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-6,
image_size=[896, 448],
patch_size=16,
num_channels=3,
qkv_bias=True,
mlp_dim=None,
drop_path_rate=0.1,
pretrain_image_size=224,
decoder_hidden_size=64,
use_relative_position_embeddings=True,
merge_index=2,
intermediate_hidden_state_indices=[5, 11, 17, 23],
beta=0.01,
**kwargs,
):
super().__init__(**kwargs)
if merge_index > min(intermediate_hidden_state_indices):
raise ValueError(
f"Merge index must be less than the minimum encoder output index, but got {merge_index=} and {intermediate_hidden_state_indices=}"
)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.qkv_bias = qkv_bias
self.drop_path_rate = drop_path_rate
self.pretrain_image_size = pretrain_image_size
self.decoder_hidden_size = decoder_hidden_size
self.use_relative_position_embeddings = use_relative_position_embeddings
self.merge_index = merge_index
self.intermediate_hidden_state_indices = intermediate_hidden_state_indices
self.beta = beta
self.mlp_dim = int(hidden_size * 4) if mlp_dim is None else mlp_dim

View File

@ -0,0 +1,222 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert SegGPT checkpoints from the original repository.
URL: https://github.com/baaivision/Painter/tree/main/SegGPT
"""
import argparse
import requests
import torch
from PIL import Image
from transformers import SegGptConfig, SegGptForImageSegmentation, SegGptImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
rename_keys = []
# fmt: off
# rename embedding and its parameters
rename_keys.append(("patch_embed.proj.weight", "model.embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("patch_embed.proj.bias", "model.embeddings.patch_embeddings.projection.bias"))
rename_keys.append(("mask_token", "model.embeddings.mask_token"))
rename_keys.append(("segment_token_x", "model.embeddings.segment_token_input"))
rename_keys.append(("segment_token_y", "model.embeddings.segment_token_prompt"))
rename_keys.append(("type_token_cls", "model.embeddings.type_token_semantic"))
rename_keys.append(("type_token_ins", "model.embeddings.type_token_instance"))
rename_keys.append(("pos_embed", "model.embeddings.position_embeddings"))
# rename decoder and other
rename_keys.append(("norm.weight", "model.encoder.layernorm.weight"))
rename_keys.append(("norm.bias", "model.encoder.layernorm.bias"))
rename_keys.append(("decoder_embed.weight", "decoder.decoder_embed.weight"))
rename_keys.append(("decoder_embed.bias", "decoder.decoder_embed.bias"))
rename_keys.append(("decoder_pred.0.weight", "decoder.decoder_pred.conv.weight"))
rename_keys.append(("decoder_pred.0.bias", "decoder.decoder_pred.conv.bias"))
rename_keys.append(("decoder_pred.1.weight", "decoder.decoder_pred.layernorm.weight"))
rename_keys.append(("decoder_pred.1.bias", "decoder.decoder_pred.layernorm.bias"))
rename_keys.append(("decoder_pred.3.weight", "decoder.decoder_pred.head.weight"))
rename_keys.append(("decoder_pred.3.bias", "decoder.decoder_pred.head.bias"))
# rename blocks
for i in range(config.num_hidden_layers):
rename_keys.append((f"blocks.{i}.attn.qkv.weight", f"model.encoder.layers.{i}.attention.qkv.weight"))
rename_keys.append((f"blocks.{i}.attn.qkv.bias", f"model.encoder.layers.{i}.attention.qkv.bias"))
rename_keys.append((f"blocks.{i}.attn.proj.weight", f"model.encoder.layers.{i}.attention.proj.weight"))
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"model.encoder.layers.{i}.attention.proj.bias"))
rename_keys.append((f"blocks.{i}.attn.rel_pos_h", f"model.encoder.layers.{i}.attention.rel_pos_h"))
rename_keys.append((f"blocks.{i}.attn.rel_pos_w", f"model.encoder.layers.{i}.attention.rel_pos_w"))
rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"model.encoder.layers.{i}.mlp.lin1.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"model.encoder.layers.{i}.mlp.lin1.bias"))
rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"model.encoder.layers.{i}.mlp.lin2.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"model.encoder.layers.{i}.mlp.lin2.bias"))
rename_keys.append((f"blocks.{i}.norm1.weight", f"model.encoder.layers.{i}.layernorm_before.weight"))
rename_keys.append((f"blocks.{i}.norm1.bias", f"model.encoder.layers.{i}.layernorm_before.bias"))
rename_keys.append((f"blocks.{i}.norm2.weight", f"model.encoder.layers.{i}.layernorm_after.weight"))
rename_keys.append((f"blocks.{i}.norm2.bias", f"model.encoder.layers.{i}.layernorm_after.bias"))
# fmt: on
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
# We will verify our results on spongebob images
def prepare_input():
image_input_url = (
"https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
)
image_prompt_url = (
"https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
)
mask_prompt_url = (
"https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"
)
image_input = Image.open(requests.get(image_input_url, stream=True).raw)
image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw)
return image_input, image_prompt, mask_prompt
@torch.no_grad()
def convert_seggpt_checkpoint(args):
model_name = args.model_name
pytorch_dump_folder_path = args.pytorch_dump_folder_path
verify_logits = args.verify_logits
push_to_hub = args.push_to_hub
# Define default GroundingDINO configuation
config = SegGptConfig()
# Load original checkpoint
checkpoint_url = "https://huggingface.co/BAAI/SegGpt/blob/main/seggpt_vit_large.pth"
original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
# # Rename keys
new_state_dict = original_state_dict.copy()
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(new_state_dict, src, dest)
# Load HF model
model = SegGptForImageSegmentation(config)
model.eval()
missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)
input_img, prompt_img, prompt_mask = prepare_input()
image_processor = SegGptImageProcessor()
inputs = image_processor(images=input_img, prompt_images=prompt_img, prompt_masks=prompt_mask, return_tensors="pt")
expected_prompt_pixel_values = torch.tensor(
[
[[-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965]],
[[1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583]],
[[2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088]],
]
)
expected_pixel_values = torch.tensor(
[
[[1.6324, 1.6153, 1.5810], [1.6153, 1.5982, 1.5810], [1.5810, 1.5639, 1.5639]],
[[1.2731, 1.2556, 1.2206], [1.2556, 1.2381, 1.2031], [1.2206, 1.2031, 1.1681]],
[[1.6465, 1.6465, 1.6465], [1.6465, 1.6465, 1.6465], [1.6291, 1.6291, 1.6291]],
]
)
expected_prompt_masks = torch.tensor(
[
[[-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179]],
[[-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357]],
[[-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044]],
]
)
assert torch.allclose(inputs.pixel_values[0, :, :3, :3], expected_pixel_values, atol=1e-4)
assert torch.allclose(inputs.prompt_pixel_values[0, :, :3, :3], expected_prompt_pixel_values, atol=1e-4)
assert torch.allclose(inputs.prompt_masks[0, :, :3, :3], expected_prompt_masks, atol=1e-4)
torch.manual_seed(2)
outputs = model(**inputs)
print(outputs)
if verify_logits:
expected_output = torch.tensor(
[
[[-2.1208, -2.1190, -2.1198], [-2.1237, -2.1228, -2.1227], [-2.1232, -2.1226, -2.1228]],
[[-2.0405, -2.0396, -2.0403], [-2.0434, -2.0434, -2.0433], [-2.0428, -2.0432, -2.0434]],
[[-1.8102, -1.8088, -1.8099], [-1.8131, -1.8126, -1.8129], [-1.8130, -1.8128, -1.8131]],
]
)
assert torch.allclose(outputs.pred_masks[0, :, :3, :3], expected_output, atol=1e-4)
print("Looks good!")
else:
print("Converted without verifying logits")
if pytorch_dump_folder_path is not None:
print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
image_processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print(f"Pushing model and processor for {model_name} to hub")
model.push_to_hub(f"EduardoPacheco/{model_name}")
image_processor.push_to_hub(f"EduardoPacheco/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--model_name",
default="seggpt-vit-large",
type=str,
choices=["seggpt-vit-large"],
help="Name of the SegGpt model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--verify_logits",
action="store_false",
help="Whether or not to verify the logits against the original implementation.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_seggpt_checkpoint(args)

View File

@ -0,0 +1,626 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for SegGPT."""
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_channel_dimension_axis,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from ...utils import TensorType, is_torch_available, logging, requires_backends
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
# See https://arxiv.org/pdf/2212.02499.pdf at 3.1 Redefining Output Spaces as "Images" - Semantic Segmentation from PAINTER paper
# Taken from https://github.com/Abdullah-Meda/Painter/blob/main/Painter/data/coco_semseg/gen_color_coco_panoptic_segm.py#L31
def build_palette(num_labels: int) -> List[Tuple[int, int]]:
base = int(num_labels ** (1 / 3)) + 1
margin = 256 // base
# we assume that class_idx 0 is the background which is mapped to black
color_list = [(0, 0, 0)]
for location in range(num_labels):
num_seq_r = location // base**2
num_seq_g = (location % base**2) // base
num_seq_b = location % base
R = 255 - num_seq_r * margin
G = 255 - num_seq_g * margin
B = 255 - num_seq_b * margin
color_list.append((R, G, B))
return color_list
def get_num_channels(image: np.ndarray, input_data_format: ChannelDimension) -> int:
if image.ndim == 2:
return 0
channel_idx = get_channel_dimension_axis(image, input_data_format)
return image.shape[channel_idx]
def mask_to_rgb(
mask: np.ndarray,
palette: Optional[List[Tuple[int, int]]] = None,
input_data_format: Optional[ChannelDimension] = None,
data_format: Optional[ChannelDimension] = None,
) -> np.ndarray:
if input_data_format is None and mask.ndim > 2:
input_data_format = infer_channel_dimension_format(mask)
data_format = data_format if data_format is not None else input_data_format
num_channels = get_num_channels(mask, input_data_format)
if num_channels == 3:
return to_channel_dimension_format(mask, data_format, input_data_format) if data_format is not None else mask
if palette is not None:
height, width = mask.shape
rgb_mask = np.zeros((3, height, width), dtype=np.uint8)
classes_in_mask = np.unique(mask)
for class_idx in classes_in_mask:
rgb_value = palette[class_idx]
class_mask = (mask == class_idx).astype(np.uint8)
class_mask = np.expand_dims(class_mask, axis=-1)
class_rgb_mask = class_mask * np.array(rgb_value)
class_rgb_mask = np.moveaxis(class_rgb_mask, -1, 0)
rgb_mask += class_rgb_mask.astype(np.uint8)
rgb_mask = np.clip(rgb_mask, 0, 255).astype(np.uint8)
else:
rgb_mask = np.repeat(mask[None, ...], 3, axis=0)
return (
to_channel_dimension_format(rgb_mask, data_format, input_data_format) if data_format is not None else rgb_mask
)
class SegGptImageProcessor(BaseImageProcessor):
r"""
Constructs a SegGpt image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
size (`dict`, *optional*, defaults to `{"height": 448, "width": 448}`):
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 448, "width": 448}
size = get_size_dict(size)
self.do_resize = do_resize
self.do_rescale = do_rescale
self.do_normalize = do_normalize
self.size = size
self.resample = resample
self.rescale_factor = rescale_factor
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
def get_palette(self, num_labels: int) -> List[Tuple[int, int]]:
"""Build a palette to map the prompt mask from a single channel to a 3 channel RGB.
Args:
num_labels (`int`):
Number of classes in the segmentation task (excluding the background).
Returns:
`List[Tuple[int, int]]`: Palette to map the prompt mask from a single channel to a 3 channel RGB.
"""
return build_palette(num_labels)
def mask_to_rgb(
self,
image: np.ndarray,
palette: Optional[List[Tuple[int, int]]] = None,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Convert a mask to RGB format.
Args:
image (`np.ndarray`):
Mask to convert to RGB format. If the mask is already in RGB format, it will be passed through.
palette (`List[Tuple[int, int]]`, *optional*, defaults to `None`):
Palette to use to convert the mask to RGB format. If unset, the mask is duplicated across the channel
dimension.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
Returns:
`np.ndarray`: The mask in RGB format.
"""
return mask_to_rgb(
image,
palette=palette,
data_format=data_format,
input_data_format=input_data_format,
)
# Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def _preprocess_step(
self,
images: ImageInput,
is_mask: bool = False,
do_resize: Optional[bool] = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
num_labels: Optional[int] = None,
**kwargs,
):
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to _preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
is_mask (`bool`, *optional*, defaults to `False`):
Whether the image is a mask. If True, the image is converted to RGB using the palette if
`self.num_labels` is specified otherwise RGB is achieved by duplicating the channel.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
resizing.
resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
`PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BICUBIC`. Only has
an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values between [0 - 1].
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use if `do_normalize` is set to `True`.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use if `do_normalize` is set to `True`.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
num_labels: (`int`, *optional*):
Number of classes in the segmentation task (excluding the background). If specified, a palette will be
built, assuming that class_idx 0 is the background, to map the prompt mask from a single class_idx
channel to a 3 channel RGB. Not specifying this will result in the prompt mask either being passed
through as is if it is already in RGB format or being duplicated across the channel dimension.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
resample = resample if resample is not None else self.resample
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
size = size if size is not None else self.size
size_dict = get_size_dict(size)
images = make_list_of_images(images)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None:
raise ValueError("Size must be specified if do_resize is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("Image mean and std must be specified if do_normalize is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if is_scaled_image(images[0]) and do_rescale:
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None and not is_mask:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
if is_mask:
palette = self.get_palette(num_labels) if num_labels is not None else None
# Since this is the input for the next transformations its format should be the same as the input_data_format
images = [
self.mask_to_rgb(image=image, palette=palette, data_format=ChannelDimension.FIRST) for image in images
]
input_data_format = ChannelDimension.FIRST
if do_resize:
images = [
self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format)
for image in images
]
if do_rescale:
images = [
self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
for image in images
]
if do_normalize:
images = [
self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
for image in images
]
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
]
return images
def preprocess(
self,
images: Optional[ImageInput] = None,
prompt_images: Optional[ImageInput] = None,
prompt_masks: Optional[ImageInput] = None,
do_resize: Optional[bool] = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
num_labels: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to _preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
prompt_images (`ImageInput`):
Prompt image to _preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
prompt_masks (`ImageInput`):
Prompt mask from prompt image to _preprocess. Expects a single or batch of masks. If the mask masks are
a single channel then it will be converted to RGB using the palette if `self.num_labels` is specified
or by just repeating the channel if not. If the mask is already in RGB format, it will be passed through.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
resizing.
resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
`PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BICUBIC`. Only has
an effect if `do_resize` is set to `True`. Doesn't apply to prompt mask as it is resized using nearest.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values between [0 - 1].
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use if `do_normalize` is set to `True`.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use if `do_normalize` is set to `True`.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
num_labels: (`int`, *optional*):
Number of classes in the segmentation task (excluding the background). If specified, a palette will be
built, assuming that class_idx 0 is the background, to map the prompt mask from a single class_idx
channel to a 3 channel RGB. Not specifying this will result in the prompt mask either being passed
through as is if it is already in RGB format or being duplicated across the channel dimension.
"""
if all(v is None for v in [images, prompt_images, prompt_masks]):
raise ValueError("At least one of images, prompt_images, prompt_masks must be specified.")
data = {}
if images is not None:
images = self._preprocess_step(
images,
is_mask=False,
do_resize=do_resize,
size=size,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
data["pixel_values"] = images
if prompt_images is not None:
prompt_images = self._preprocess_step(
prompt_images,
is_mask=False,
do_resize=do_resize,
size=size,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
data["prompt_pixel_values"] = prompt_images
if prompt_masks is not None:
prompt_masks = self._preprocess_step(
prompt_masks,
is_mask=True,
do_resize=do_resize,
size=size,
resample=PILImageResampling.NEAREST,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
data_format=data_format,
input_data_format=input_data_format,
num_labels=num_labels,
**kwargs,
)
data["prompt_masks"] = prompt_masks
return BatchFeature(data=data, tensor_type=return_tensors)
def post_process_semantic_segmentation(
self, outputs, target_sizes: Optional[List[Tuple[int, int]]] = None, num_labels: Optional[int] = None
):
"""
Converts the output of [`SegGptImageSegmentationOutput`] into segmentation maps. Only supports
PyTorch.
Args:
outputs ([`SegGptImageSegmentationOutput`]):
Raw outputs of the model.
target_sizes (`List[Tuple[int, int]]`, *optional*):
List of length (batch_size), where each list item (`Tuple[int, int]`) corresponds to the requested
final size (height, width) of each prediction. If left to None, predictions will not be resized.
num_labels (`int`, *optional*):
Number of classes in the segmentation task (excluding the background). If specified, a palette will be
built, assuming that class_idx 0 is the background, to map prediction masks from RGB values to class
indices. This value should be the same used when preprocessing inputs.
Returns:
semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
"""
requires_backends(self, ["torch"])
# batch_size x num_channels x 2*height x width
masks = outputs.pred_masks
# Predicted mask and prompt are concatenated in the height dimension
# batch_size x num_channels x height x width
masks = masks[:, :, masks.shape[2] // 2 :, :]
# To unnormalize we need to permute to channel last
# batch_size x height x width x num_channels
std = torch.tensor(self.image_std).to(masks.device)
mean = torch.tensor(self.image_mean).to(masks.device)
masks = masks.permute(0, 2, 3, 1) * std + mean
# batch_size x num_channels x height x width
masks = masks.permute(0, 3, 1, 2)
# Clip to match with palette if specified
masks = torch.clip(masks * 255, 0, 255)
semantic_segmentation = []
palette_tensor = None
palette = self.get_palette(num_labels) if num_labels is not None else None
if palette is not None:
palette_tensor = torch.tensor(palette).float().to(masks.device)
_, num_channels, _, _ = masks.shape
palette_tensor = palette_tensor.view(1, 1, num_labels + 1, num_channels)
for idx, mask in enumerate(masks):
if target_sizes is not None:
mask = torch.nn.functional.interpolate(
mask.unsqueeze(0),
size=target_sizes[idx],
mode="nearest",
)[0]
if num_labels is not None:
channels, height, width = mask.shape
dist = mask.permute(1, 2, 0).view(height, width, 1, channels)
dist = dist - palette_tensor
dist = torch.pow(dist, 2)
dist = torch.sum(dist, dim=-1)
pred = dist.argmin(dim=-1)
else:
# If no palette is specified SegGpt will try to paint using the mask class idx as RGB
pred = mask.mean(dim=0).int()
semantic_segmentation.append(pred)
return semantic_segmentation

File diff suppressed because it is too large Load Diff

View File

@ -7556,6 +7556,30 @@ class SegformerPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST = None
class SegGptForImageSegmentation(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class SegGptModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class SegGptPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
SEW_PRETRAINED_MODEL_ARCHIVE_LIST = None

View File

@ -471,6 +471,13 @@ class SegformerImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class SegGptImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class SiglipImageProcessor(metaclass=DummyObject):
_backends = ["vision"]

View File

View File

@ -0,0 +1,231 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from datasets import load_dataset
from transformers.testing_utils import require_torch, require_vision, slow
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
from transformers.models.seggpt.modeling_seggpt import SegGptImageSegmentationOutput
if is_vision_available():
from transformers import SegGptImageProcessor
class SegGptImageProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
):
size = size if size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
def prepare_image_processor_dict(self):
return {
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_normalize": self.do_normalize,
"do_resize": self.do_resize,
"size": self.size,
}
def expected_output_image_shape(self, images):
return self.num_channels, self.size["height"], self.size["width"]
def expected_post_processed_shape(self):
return self.size["height"] // 2, self.size["width"]
def get_fake_image_segmentation_output(self):
torch.manual_seed(42)
return SegGptImageSegmentationOutput(
pred_masks=torch.rand(self.batch_size, self.num_channels, self.size["height"], self.size["width"])
)
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
def prepare_mask():
ds = load_dataset("EduardoPacheco/seggpt-example-data")["train"]
return ds[0]["mask"].convert("L")
def prepare_img():
ds = load_dataset("EduardoPacheco/seggpt-example-data")["train"]
images = [image.convert("RGB") for image in ds["image"]]
masks = [image.convert("RGB") for image in ds["mask"]]
return images, masks
@require_torch
@require_vision
class SegGptImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = SegGptImageProcessor if is_vision_available() else None
def setUp(self):
self.image_processor_tester = SegGptImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
def test_image_processor_palette(self):
num_labels = 3
image_processing = self.image_processing_class(**self.image_processor_dict)
palette = image_processing.get_palette(num_labels)
self.assertEqual(len(palette), num_labels + 1)
self.assertEqual(palette[0], (0, 0, 0))
def test_mask_equivalence(self):
image_processor = SegGptImageProcessor()
mask_binary = prepare_mask()
mask_rgb = mask_binary.convert("RGB")
inputs_binary = image_processor(images=None, prompt_masks=mask_binary, return_tensors="pt")
inputs_rgb = image_processor(images=None, prompt_masks=mask_rgb, return_tensors="pt")
self.assertTrue((inputs_binary["prompt_masks"] == inputs_rgb["prompt_masks"]).all().item())
def test_mask_to_rgb(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
mask = prepare_mask()
mask = np.array(mask)
mask = (mask > 0).astype(np.uint8)
def check_two_colors(image, color1=(0, 0, 0), color2=(255, 255, 255)):
pixels = image.transpose(1, 2, 0).reshape(-1, 3)
unique_colors = np.unique(pixels, axis=0)
if len(unique_colors) == 2 and (color1 in unique_colors) and (color2 in unique_colors):
return True
else:
return False
num_labels = 1
palette = image_processing.get_palette(num_labels)
# Should only duplicate repeat class indices map, hence only (0,0,0) and (1,1,1)
mask_duplicated = image_processing.mask_to_rgb(mask)
# Mask using palette, since only 1 class is present we have colors (0,0,0) and (255,255,255)
mask_painted = image_processing.mask_to_rgb(mask, palette=palette)
self.assertTrue(check_two_colors(mask_duplicated, color2=(1, 1, 1)))
self.assertTrue(check_two_colors(mask_painted, color2=(255, 255, 255)))
def test_post_processing_semantic_segmentation(self):
image_processor = self.image_processing_class(**self.image_processor_dict)
outputs = self.image_processor_tester.get_fake_image_segmentation_output()
post_processed = image_processor.post_process_semantic_segmentation(outputs)
self.assertEqual(len(post_processed), self.image_processor_tester.batch_size)
expected_semantic_map_shape = self.image_processor_tester.expected_post_processed_shape()
self.assertEqual(post_processed[0].shape, expected_semantic_map_shape)
@slow
def test_pixel_values(self):
images, masks = prepare_img()
input_image = images[1]
prompt_image = images[0]
prompt_mask = masks[0]
image_processor = SegGptImageProcessor.from_pretrained("BAAI/seggpt-vit-large")
inputs = image_processor(
images=input_image, prompt_images=prompt_image, prompt_masks=prompt_mask, return_tensors="pt"
)
# Verify pixel values
expected_prompt_pixel_values = torch.tensor(
[
[[-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965]],
[[1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583]],
[[2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088]],
]
)
expected_pixel_values = torch.tensor(
[
[[1.6324, 1.6153, 1.5810], [1.6153, 1.5982, 1.5810], [1.5810, 1.5639, 1.5639]],
[[1.2731, 1.2556, 1.2206], [1.2556, 1.2381, 1.2031], [1.2206, 1.2031, 1.1681]],
[[1.6465, 1.6465, 1.6465], [1.6465, 1.6465, 1.6465], [1.6291, 1.6291, 1.6291]],
]
)
expected_prompt_masks = torch.tensor(
[
[[-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179]],
[[-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357]],
[[-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044]],
]
)
self.assertTrue(torch.allclose(inputs.pixel_values[0, :, :3, :3], expected_pixel_values, atol=1e-4))
self.assertTrue(
torch.allclose(inputs.prompt_pixel_values[0, :, :3, :3], expected_prompt_pixel_values, atol=1e-4)
)
self.assertTrue(torch.allclose(inputs.prompt_masks[0, :, :3, :3], expected_prompt_masks, atol=1e-4))

View File

@ -0,0 +1,339 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch SegGpt model. """
import inspect
import unittest
from datasets import load_dataset
from transformers import SegGptConfig
from transformers.testing_utils import (
require_torch,
require_vision,
slow,
torch_device,
)
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from torch import nn
from transformers import SegGptForImageSegmentation, SegGptModel
from transformers.models.seggpt.modeling_seggpt import SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from transformers import SegGptImageProcessor
class SegGptModelTester:
def __init__(
self,
parent,
batch_size=2,
image_size=30,
patch_size=2,
num_channels=3,
is_training=False,
use_labels=True,
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=4,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
mlp_ratio=2.0,
merge_index=0,
intermediate_hidden_state_indices=[1],
pretrain_image_size=10,
decoder_hidden_size=10,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.is_training = is_training
self.use_labels = use_labels
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.mlp_ratio = mlp_ratio
self.merge_index = merge_index
self.intermediate_hidden_state_indices = intermediate_hidden_state_indices
self.pretrain_image_size = pretrain_image_size
self.decoder_hidden_size = decoder_hidden_size
# in SegGpt, the seq length equals the number of patches (we don't use the [CLS] token)
num_patches = (image_size // patch_size) ** 2
self.seq_length = num_patches
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size // 2, self.image_size])
prompt_pixel_values = floats_tensor(
[self.batch_size, self.num_channels, self.image_size // 2, self.image_size]
)
prompt_masks = floats_tensor([self.batch_size, self.num_channels, self.image_size // 2, self.image_size])
labels = None
if self.use_labels:
labels = floats_tensor([self.batch_size, self.num_channels, self.image_size // 2, self.image_size])
config = self.get_config()
return config, pixel_values, prompt_pixel_values, prompt_masks, labels
def get_config(self):
return SegGptConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
initializer_range=self.initializer_range,
mlp_ratio=self.mlp_ratio,
merge_index=self.merge_index,
intermediate_hidden_state_indices=self.intermediate_hidden_state_indices,
pretrain_image_size=self.pretrain_image_size,
decoder_hidden_size=self.decoder_hidden_size,
)
def create_and_check_model(self, config, pixel_values, prompt_pixel_values, prompt_masks, labels):
model = SegGptModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values, prompt_pixel_values, prompt_masks)
self.parent.assertEqual(
result.last_hidden_state.shape,
(
self.batch_size,
self.image_size // self.patch_size,
self.image_size // self.patch_size,
self.hidden_size,
),
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
pixel_values,
prompt_pixel_values,
prompt_masks,
labels,
) = config_and_inputs
inputs_dict = {
"pixel_values": pixel_values,
"prompt_pixel_values": prompt_pixel_values,
"prompt_masks": prompt_masks,
}
return config, inputs_dict
@require_torch
class SegGptModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as SegGpt does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (SegGptModel, SegGptForImageSegmentation) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
test_torchscript = False
pipeline_model_mapping = (
{"feature-extraction": SegGptModel, "mask-generation": SegGptModel} if is_torch_available() else {}
)
def setUp(self):
self.model_tester = SegGptModelTester(self)
self.config_tester = ConfigTester(self, config_class=SegGptConfig, has_text_modality=False)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="SegGpt does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values", "prompt_pixel_values", "prompt_masks"]
self.assertListEqual(arg_names[:3], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
patch_height = patch_width = config.image_size // config.patch_size
self.assertListEqual(
list(hidden_states[0].shape[-3:]),
[patch_height, patch_width, self.model_tester.hidden_size],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
@slow
def test_model_from_pretrained(self):
for model_name in SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = SegGptModel.from_pretrained(model_name)
self.assertIsNotNone(model)
def prepare_img():
ds = load_dataset("EduardoPacheco/seggpt-example-data")["train"]
images = [image.convert("RGB") for image in ds["image"]]
masks = [image.convert("RGB") for image in ds["mask"]]
return images, masks
@require_torch
@require_vision
class SegGptModelIntegrationTest(unittest.TestCase):
@cached_property
def default_image_processor(self):
return SegGptImageProcessor.from_pretrained("BAAI/seggpt-vit-large") if is_vision_available() else None
@slow
def test_one_shot_inference(self):
model = SegGptForImageSegmentation.from_pretrained("BAAI/seggpt-vit-large").to(torch_device)
image_processor = self.default_image_processor
images, masks = prepare_img()
input_image = images[1]
prompt_image = images[0]
prompt_mask = masks[0]
inputs = image_processor(
images=input_image, prompt_images=prompt_image, prompt_masks=prompt_mask, return_tensors="pt"
)
inputs = inputs.to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 3, 896, 448))
self.assertEqual(outputs.pred_masks.shape, expected_shape)
expected_slice = torch.tensor(
[
[[-2.1208, -2.1190, -2.1198], [-2.1237, -2.1228, -2.1227], [-2.1232, -2.1226, -2.1228]],
[[-2.0405, -2.0396, -2.0403], [-2.0434, -2.0434, -2.0433], [-2.0428, -2.0432, -2.0434]],
[[-1.8102, -1.8088, -1.8099], [-1.8131, -1.8126, -1.8129], [-1.8130, -1.8128, -1.8131]],
]
).to(torch_device)
self.assertTrue(torch.allclose(outputs.pred_masks[0, :, :3, :3], expected_slice, atol=1e-4))
result = image_processor.post_process_semantic_segmentation(outputs, [input_image.size[::-1]])[0]
result_expected_shape = torch.Size((170, 297))
expected_area = 1082
area = (result > 0).sum().item()
self.assertEqual(result.shape, result_expected_shape)
self.assertEqual(area, expected_area)
@slow
def test_few_shot_inference(self):
model = SegGptForImageSegmentation.from_pretrained("BAAI/seggpt-vit-large").to(torch_device)
image_processor = self.default_image_processor
images, masks = prepare_img()
input_images = [images[1]] * 2
prompt_images = [images[0], images[2]]
prompt_masks = [masks[0], masks[2]]
inputs = image_processor(
images=input_images, prompt_images=prompt_images, prompt_masks=prompt_masks, return_tensors="pt"
)
inputs = {k: v.to(torch_device) for k, v in inputs.items()}
with torch.no_grad():
outputs = model(**inputs, feature_ensemble=True)
expected_shape = torch.Size((2, 3, 896, 448))
expected_slice = torch.tensor(
[
[[-2.1201, -2.1192, -2.1189], [-2.1217, -2.1210, -2.1204], [-2.1216, -2.1202, -2.1194]],
[[-2.0393, -2.0390, -2.0387], [-2.0402, -2.0402, -2.0397], [-2.0400, -2.0394, -2.0388]],
[[-1.8083, -1.8076, -1.8077], [-1.8105, -1.8102, -1.8099], [-1.8105, -1.8095, -1.8090]],
]
).to(torch_device)
self.assertEqual(outputs.pred_masks.shape, expected_shape)
self.assertTrue(torch.allclose(outputs.pred_masks[0, :, 448:451, :3], expected_slice, atol=4e-4))

View File

@ -958,6 +958,16 @@ class ModelTesterMixin:
traced_model = torch.jit.trace(
model, (input_ids, bbox), check_trace=False
) # when traced model is checked, an error is produced due to name mangling
elif (
"pixel_values" in inputs and "prompt_pixel_values" in inputs and "prompt_masks" in inputs
): # SegGpt requires additional inputs
pixel_values = inputs["pixel_values"]
prompt_pixel_values = inputs["prompt_pixel_values"]
prompt_masks = inputs["prompt_masks"]
model(pixel_values, prompt_pixel_values, prompt_masks)
traced_model = torch.jit.trace(
model, (pixel_values, prompt_pixel_values, prompt_masks), check_trace=False
) # when traced model is checked, an error is produced due to name mangling
else:
main_input = inputs[main_input_name]

View File

@ -308,6 +308,7 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
"SeamlessM4Tv2CodeHifiGan",
"SeamlessM4Tv2ForSpeechToSpeech", # no auto class for speech-to-speech
"SegGptForImageSegmentation",
"SiglipVisionModel",
"SiglipTextModel",
]