mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-31 02:02:21 +06:00
Add swin transformer v2 (#17469)
* Add files generated using transformer-cli add-new-model-like command * Add changes for swinv2 attention and forward method * Add fixes * Add modifications for weight conversion and remaining args in swin model * Add changes for patchmerging * Add changes for SwinV2selfattention * Update conversion script * Add final fixes for the swin_v2 model * Add changes for conversion script for pretrained window size case * Add pretrained window size value from config in SwinV2Encoder class * Make fixup * Add swinv2 to models_not_in_readme to utils/check_copies.py * Modify Swinv2v2 to Swin Transformer V2 * Remove copied from, to run make fixup command * Add updates to swinv2tf from main branch * Add pretrained_window_size to config, to make tests pass * Add modified weights from nandwalritik profile for swinv2 * Update model weights from swinv2 from nandwalritik profile * Add fix for build_pr_documentation CI fix * Add fixes for weight conversion * Add change to make input with padding work * Add fixes for test cases * Add few changes from swin to swinv2 to pass test cases * Remove tests for tensorflow as swinv2 for TF is not added yet * Overide test_pt_tf_model_equivalence function as TF implementation for swinv2 is not added yet * Add modeling_tf_swinv2 to _ignore_modules as test file is removed for this one right now. 
* Update docs url for swinv2 in README.md Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Undo changes for check_repo * Update url in readme.md * Remove overrided function to test pt_tf_model_equivalence * Remove TF model imports for Swinv2 as its not implemented in this PR * Add changes for index.mdx * Add swinv2 papers link,abstract and contributors details * Rename cpb_mlp to continous_position_bias_mlp * Add tips for swinv2 model * Update src/transformers/models/swinv2/configuration_swinv2.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/swinv2/configuration_swinv2.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Fix indentation for docstring example in src/transformers/models/swinv2/configuration_swinv2.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update import order in src/transformers/models/swinv2/configuration_swinv2.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Add copyright statements in weights conversion script. Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Remove Swinv2 from models_not_in_readme * Reformat code * Remove TF implementation file for swinv2 * Update start docstring. Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Add changes for docstring * Update orgname for weights to microsoft * Remove to_2tuple function * Add copied from statements wherever applicable * Add copied from to Swinv2ForMaskedImageModelling class * Reformat code. Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Add unittest.skip(with reason.) for test_inputs_embeds test case. 
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Add updates for test_modeling_swinv2.py * Add @unittest.skip() annotation for clarity to create_and_test_config_common_properties function * Add continuous_position_bias_mlp parameter to conversion script * Add test for testing masked_image_modelling for swinv2 * Update Swinv2 to Swin Transformer v2 in docs/source/en/model_doc/swinv2.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update Swinv2 to Swin Transformer v2 in docs/source/en/model_doc/swinv2.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update docs/source/en/model_doc/swinv2.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update docs/source/en/model_doc/swinv2.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Add suggested changes * Add copied from to forward methods of Swinv2Stage and Swinv2Encoder * Add push_to_hub flag to weight conversion script * Change order or Swinv2DropPath class * Add id2label mapping for imagenet 21k * Add updated url for SwinV2 functions and classes used in implementation * Update input_feature dimensions format, mentioned in comments. Co-authored-by: Alara Dirik <8944735+alaradirik@users.noreply.github.com> * Add suggested changes for modeling_swin2.py * Update docs * Remove create_and_test_config_common_properties function, as test_model_common_attributes is sufficient. * Fix indentation. Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Add changes for making Nit objects in code style * Add suggested changes * Add suggested changes for test_modelling_swinv2 * make fix-copies * Update docs/source/en/model_doc/swinv2.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Alara Dirik <8944735+alaradirik@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
parent
c89a592e87
commit
e87ac9d18b
@ -356,6 +356,7 @@ Current number of checkpoints: ** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/main/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
|
@ -312,6 +312,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/main/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
|
@ -336,6 +336,7 @@ conda install -c huggingface transformers
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/main/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。
|
||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
|
||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
|
||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。
|
||||
|
@ -348,6 +348,7 @@ conda install -c huggingface transformers
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/main/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
|
@ -378,6 +378,8 @@
|
||||
title: SqueezeBERT
|
||||
- local: model_doc/swin
|
||||
title: Swin Transformer
|
||||
- local: model_doc/swinv2
|
||||
title: Swin Transformer V2
|
||||
- local: model_doc/t5
|
||||
title: T5
|
||||
- local: model_doc/t5v1.1
|
||||
|
@ -154,6 +154,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
|
||||
1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
@ -289,6 +290,7 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
|
47
docs/source/en/model_doc/swinv2.mdx
Normal file
47
docs/source/en/model_doc/swinv2.mdx
Normal file
@ -0,0 +1,47 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Swin Transformer V2
|
||||
|
||||
## Overview
|
||||
|
||||
The Swin Transformer V2 model was proposed in [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Large-scale NLP models have been shown to significantly improve the performance on language tasks with no signs of saturation. They also demonstrate amazing few-shot capabilities like that of human beings. This paper aims to explore large-scale models in computer vision. We tackle three major issues in training and application of large vision models, including training instability, resolution gaps between pre-training and fine-tuning, and hunger on labelled data. Three main techniques are proposed: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) A log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) A self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images. Through these techniques, this paper successfully trained a 3 billion-parameter Swin Transformer V2 model, which is the largest dense vision model to date, and makes it capable of training with images of up to 1,536×1,536 resolution. It set new performance records on 4 representative vision tasks, including ImageNet-V2 image classification, COCO object detection, ADE20K semantic segmentation, and Kinetics-400 video action classification. Also note our training is much more efficient than that in Google's billion-level visual models, which consumes 40 times less labelled data and 40 times less training time.*
|
||||
|
||||
Tips:
|
||||
- One can use the [`AutoFeatureExtractor`] API to prepare images for the model.
|
||||
|
||||
This model was contributed by [nandwalritik](https://huggingface.co/nandwalritik).
|
||||
The original code can be found [here](https://github.com/microsoft/Swin-Transformer).
|
||||
|
||||
|
||||
## Swinv2Config
|
||||
|
||||
[[autodoc]] Swinv2Config
|
||||
|
||||
## Swinv2Model
|
||||
|
||||
[[autodoc]] Swinv2Model
|
||||
- forward
|
||||
|
||||
## Swinv2ForMaskedImageModeling
|
||||
|
||||
[[autodoc]] Swinv2ForMaskedImageModeling
|
||||
- forward
|
||||
|
||||
## Swinv2ForImageClassification
|
||||
|
||||
[[autodoc]] Swinv2ForImageClassification
|
||||
- forward
|
@ -313,6 +313,7 @@ _import_structure = {
|
||||
"models.splinter": ["SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SplinterConfig", "SplinterTokenizer"],
|
||||
"models.squeezebert": ["SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SqueezeBertConfig", "SqueezeBertTokenizer"],
|
||||
"models.swin": ["SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "SwinConfig"],
|
||||
"models.swinv2": ["SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Swinv2Config"],
|
||||
"models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"],
|
||||
"models.tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig", "TapasTokenizer"],
|
||||
"models.tapex": ["TapexTokenizer"],
|
||||
@ -1750,6 +1751,15 @@ else:
|
||||
"SwinPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.swinv2"].extend(
|
||||
[
|
||||
"SWINV2_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"Swinv2ForImageClassification",
|
||||
"Swinv2ForMaskedImageModeling",
|
||||
"Swinv2Model",
|
||||
"Swinv2PreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.t5"].extend(
|
||||
[
|
||||
"T5_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
@ -3068,6 +3078,7 @@ if TYPE_CHECKING:
|
||||
from .models.splinter import SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP, SplinterConfig, SplinterTokenizer
|
||||
from .models.squeezebert import SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SqueezeBertConfig, SqueezeBertTokenizer
|
||||
from .models.swin import SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, SwinConfig
|
||||
from .models.swinv2 import SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP, Swinv2Config
|
||||
from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
|
||||
from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer
|
||||
from .models.tapex import TapexTokenizer
|
||||
@ -4265,6 +4276,13 @@ if TYPE_CHECKING:
|
||||
SwinModel,
|
||||
SwinPreTrainedModel,
|
||||
)
|
||||
from .models.swinv2 import (
|
||||
SWINV2_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
Swinv2ForImageClassification,
|
||||
Swinv2ForMaskedImageModeling,
|
||||
Swinv2Model,
|
||||
Swinv2PreTrainedModel,
|
||||
)
|
||||
from .models.t5 import (
|
||||
T5_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
T5EncoderModel,
|
||||
|
@ -127,6 +127,7 @@ from . import (
|
||||
splinter,
|
||||
squeezebert,
|
||||
swin,
|
||||
swinv2,
|
||||
t5,
|
||||
tapas,
|
||||
tapex,
|
||||
|
@ -123,6 +123,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
|
||||
("splinter", "SplinterConfig"),
|
||||
("squeezebert", "SqueezeBertConfig"),
|
||||
("swin", "SwinConfig"),
|
||||
("swinv2", "Swinv2Config"),
|
||||
("t5", "T5Config"),
|
||||
("tapas", "TapasConfig"),
|
||||
("trajectory_transformer", "TrajectoryTransformerConfig"),
|
||||
@ -239,6 +240,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
|
||||
("splinter", "SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("squeezebert", "SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("swin", "SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("swinv2", "SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("t5", "T5_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
@ -374,6 +376,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
|
||||
("splinter", "Splinter"),
|
||||
("squeezebert", "SqueezeBERT"),
|
||||
("swin", "Swin Transformer"),
|
||||
("swinv2", "Swin Transformer V2"),
|
||||
("t5", "T5"),
|
||||
("t5v1.1", "T5v1.1"),
|
||||
("tapas", "TAPAS"),
|
||||
|
@ -66,6 +66,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
|
||||
("segformer", "SegformerFeatureExtractor"),
|
||||
("speech_to_text", "Speech2TextFeatureExtractor"),
|
||||
("swin", "ViTFeatureExtractor"),
|
||||
("swinv2", "ViTFeatureExtractor"),
|
||||
("van", "ConvNextFeatureExtractor"),
|
||||
("vilt", "ViltFeatureExtractor"),
|
||||
("vit", "ViTFeatureExtractor"),
|
||||
|
@ -119,6 +119,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("splinter", "SplinterModel"),
|
||||
("squeezebert", "SqueezeBertModel"),
|
||||
("swin", "SwinModel"),
|
||||
("swinv2", "Swinv2Model"),
|
||||
("t5", "T5Model"),
|
||||
("tapas", "TapasModel"),
|
||||
("trajectory_transformer", "TrajectoryTransformerModel"),
|
||||
@ -309,6 +310,7 @@ MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
("deit", "DeiTForMaskedImageModeling"),
|
||||
("swin", "SwinForMaskedImageModeling"),
|
||||
("swinv2", "Swinv2ForMaskedImageModeling"),
|
||||
("vit", "ViTForMaskedImageModeling"),
|
||||
]
|
||||
)
|
||||
@ -345,6 +347,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("resnet", "ResNetForImageClassification"),
|
||||
("segformer", "SegformerForImageClassification"),
|
||||
("swin", "SwinForImageClassification"),
|
||||
("swinv2", "Swinv2ForImageClassification"),
|
||||
("van", "VanForImageClassification"),
|
||||
("vit", "ViTForImageClassification"),
|
||||
]
|
||||
|
65
src/transformers/models/swinv2/__init__.py
Normal file
65
src/transformers/models/swinv2/__init__.py
Normal file
@ -0,0 +1,65 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
# rely on isort to merge the imports
|
||||
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
|
||||
|
||||
|
||||
_import_structure = {
|
||||
"configuration_swinv2": ["SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Swinv2Config"],
|
||||
}
|
||||
|
||||
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_swinv2"] = [
|
||||
"SWINV2_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"Swinv2ForImageClassification",
|
||||
"Swinv2ForMaskedImageModeling",
|
||||
"Swinv2Model",
|
||||
"Swinv2PreTrainedModel",
|
||||
]
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_swinv2 import SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP, Swinv2Config
|
||||
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
from .modeling_swinv2 import (
|
||||
SWINV2_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
Swinv2ForImageClassification,
|
||||
Swinv2ForMaskedImageModeling,
|
||||
Swinv2Model,
|
||||
Swinv2PreTrainedModel,
|
||||
)
|
||||
|
||||
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
147
src/transformers/models/swinv2/configuration_swinv2.py
Normal file
147
src/transformers/models/swinv2/configuration_swinv2.py
Normal file
@ -0,0 +1,147 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Swinv2 Transformer model configuration"""
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"microsoft/swinv2_tiny_patch4_windows8_256": (
|
||||
"https://huggingface.co/microsoft/swinv2_tiny_patch4_windows8_256/resolve/main/config.json"
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
class Swinv2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Swinv2Model`]. It is used to instantiate a Swin
    Transformer v2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Swin Transformer v2
    [microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 4):
            The size (resolution) of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        embed_dim (`int`, *optional*, defaults to 96):
            Dimensionality of patch embedding.
        depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`):
            Depth of each layer in the Transformer encoder.
        num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`):
            Number of attention heads in each layer of the Transformer encoder.
        window_size (`int`, *optional*, defaults to 7):
            Size of windows.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of MLP hidden dimensionality to embedding dimensionality.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether or not a learnable bias should be added to the queries, keys and values.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings and encoder.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            Stochastic depth rate.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
            `"selu"` and `"gelu_new"` are supported.
        use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
            Whether or not to add absolute position embeddings to the patch embeddings.
        patch_norm (`bool`, *optional*, defaults to `True`):
            Whether or not to add layer normalization after patch embedding.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        encoder_stride (`int`, *optional*, defaults to 32):
            Factor to increase the spatial resolution by in the decoder head for masked image modeling.
        pretrained_window_sizes (`tuple(int)`, *optional*, defaults to `(0, 0, 0, 0)`):
            The window sizes used during pretraining. Stored on the configuration as given.

    Example:

    ```python
    >>> from transformers import Swinv2Config, Swinv2Model

    >>> # Initializing a Swinv2 microsoft/swinv2-tiny-patch4-window8-256 style configuration
    >>> configuration = Swinv2Config()

    >>> # Initializing a model from the microsoft/swinv2-tiny-patch4-window8-256 style configuration
    >>> model = Swinv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "swinv2"

    # Generic attribute names resolved to the Swinv2-specific ones.
    attribute_map = {
        "num_attention_heads": "num_heads",
        "num_hidden_layers": "num_layers",
    }

    def __init__(
        self,
        image_size=224,
        patch_size=4,
        num_channels=3,
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        drop_path_rate=0.1,
        hidden_act="gelu",
        use_absolute_embeddings=False,
        patch_norm=True,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        encoder_stride=32,
        pretrained_window_sizes=(0, 0, 0, 0),
        **kwargs
    ):
        super().__init__(**kwargs)

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.embed_dim = embed_dim
        self.depths = depths
        # One encoder layer (stage) per entry of `depths`.
        self.num_layers = len(depths)
        self.num_heads = num_heads
        self.window_size = window_size
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.drop_path_rate = drop_path_rate
        self.hidden_act = hidden_act
        self.use_absolute_embeddings = use_absolute_embeddings
        self.patch_norm = patch_norm
        # Misspelled attribute name used by earlier revisions; kept as an alias so code that
        # reads `config.path_norm` keeps working.
        self.path_norm = patch_norm
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.encoder_stride = encoder_stride
        # we set the hidden_size attribute in order to make Swinv2 work with VisionEncoderDecoderModel
        # this indicates the channel dimension after the last stage of the model
        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
        # Taken from the argument rather than hard-coded, so a value supplied by the caller
        # (for example one read back from a saved configuration) is not reset to zeros.
        self.pretrained_window_sizes = pretrained_window_sizes
|
219
src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py
Normal file
219
src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py
Normal file
@ -0,0 +1,219 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert Swinv2 checkpoints from the timm library."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
import requests
|
||||
import timm
|
||||
from huggingface_hub import hf_hub_download
|
||||
from transformers import AutoFeatureExtractor, Swinv2Config, Swinv2ForImageClassification
|
||||
|
||||
|
||||
def get_swinv2_config(swinv2_name):
    """Build the `Swinv2Config` described by a Swinv2 checkpoint name.

    The name is split on `"_"` and read positionally: field 1 is the model size, field 2 the
    window size (`window8`, or `window12to16` where the value after `to` is used) and field 3 the
    image size (`256`, or `192to256` where the last three digits are used), e.g.
    `swinv2_tiny_window8_256`.

    Args:
        swinv2_name (`str`): Checkpoint name in the layout described above.

    Returns:
        `Swinv2Config`: Configuration with the architecture, image/window size and ImageNet label
        mappings filled in.

    Raises:
        IndexError: If the name has fewer than four `"_"`-separated fields.
        ValueError: If the window-size or image-size field cannot be parsed as an integer.
    """
    name_split = swinv2_name.split("_")

    model_size = name_split[1]
    if "to" in name_split[3]:
        img_size = int(name_split[3][-3:])
    else:
        img_size = int(name_split[3])
    if "to" in name_split[2]:
        window_size = int(name_split[2][-2:])
    else:
        # Strip the leading "window" (6 characters).
        window_size = int(name_split[2][6:])

    if model_size == "tiny":
        embed_dim = 96
        depths = (2, 2, 6, 2)
        num_heads = (3, 6, 12, 24)
    elif model_size == "small":
        embed_dim = 96
        depths = (2, 2, 18, 2)
        num_heads = (3, 6, 12, 24)
    elif model_size == "base":
        embed_dim = 128
        depths = (2, 2, 18, 2)
        num_heads = (4, 8, 16, 32)
    else:
        embed_dim = 192
        depths = (2, 2, 18, 2)
        num_heads = (6, 12, 24, 48)

    # The architecture is passed to the constructor instead of being assigned afterwards:
    # Swinv2Config derives `num_layers` and `hidden_size` from `depths`/`embed_dim` in `__init__`,
    # so setting those attributes on a default-built config would leave the derived values stale.
    config = Swinv2Config(
        image_size=img_size,
        embed_dim=embed_dim,
        depths=depths,
        num_heads=num_heads,
        window_size=window_size,
    )

    if "to" in swinv2_name:
        config.pretrained_window_sizes = (12, 12, 12, 6)

    # Only plain 22k checkpoints keep the 21841-class head; everything else uses ImageNet-1k labels.
    if ("22k" in swinv2_name) and ("to" not in swinv2_name):
        num_classes = 21841
        filename = "imagenet-22k-id2label.json"
    else:
        num_classes = 1000
        filename = "imagenet-1k-id2label.json"

    repo_id = "datasets/huggingface/label-files"
    # Context manager so the downloaded label file is closed after parsing.
    with open(hf_hub_download(repo_id, filename), "r") as label_file:
        id2label = json.load(label_file)
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}
    config.num_labels = num_classes

    return config
|
||||
|
||||
|
||||
def rename_key(name):
    """Translate one timm Swinv2 state-dict key into the matching Hugging Face key.

    Substring substitutions are applied in a fixed order, keys containing `"layers"` are prefixed
    with `"encoder."`, and the final layer norm is renamed. Keys containing `"head"` become
    `"classifier"` keys; every other key is prefixed with `"swinv2."`.
    """
    for old, new in (
        ("patch_embed.proj", "embeddings.patch_embeddings.projection"),
        ("patch_embed.norm", "embeddings.norm"),
    ):
        name = name.replace(old, new)

    if "layers" in name:
        name = f"encoder.{name}"

    # Order matters: "attn.proj" has to be rewritten before the bare "attn" rule runs.
    for old, new in (
        ("attn.proj", "attention.output.dense"),
        ("attn", "attention.self"),
        ("norm1", "layernorm_before"),
        ("norm2", "layernorm_after"),
        ("mlp.fc1", "intermediate.dense"),
        ("mlp.fc2", "output.dense"),
        ("q_bias", "query.bias"),
        ("k_bias", "key.bias"),
        ("v_bias", "value.bias"),
        ("cpb_mlp", "continuous_position_bias_mlp"),
    ):
        name = name.replace(old, new)

    # Exact-match renames for the final layer norm.
    name = {"norm.weight": "layernorm.weight", "norm.bias": "layernorm.bias"}.get(name, name)

    if "head" in name:
        return name.replace("head", "classifier")
    return f"swinv2.{name}"
|
||||
|
||||
|
||||
def convert_state_dict(orig_state_dict, model):
    """Rewrite a timm Swinv2 state dict in place so it uses the Hugging Face key layout.

    Keys containing `"mask"` are dropped. Fused `qkv` tensors are split along their first
    dimension into query, key and value entries, each `all_head_size` rows long (read from the
    matching attention module of `model`). All remaining keys go through `rename_key`.

    Returns:
        The same `orig_state_dict` object, mutated.
    """
    # Snapshot the keys first: the dictionary is modified while we walk it.
    for old_key in list(orig_state_dict):
        tensor = orig_state_dict.pop(old_key)

        if "mask" in old_key:
            continue

        if "qkv" not in old_key:
            orig_state_dict[rename_key(old_key)] = tensor
            continue

        # Keys look like "layers.<layer>.blocks.<block>...."; fields 1 and 3 are the indices.
        parts = old_key.split(".")
        layer_num = int(parts[1])
        block_num = int(parts[3])
        size = model.swinv2.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size
        prefix = f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self"

        if "weight" in old_key:
            suffix = "weight"
            chunks = (tensor[:size, :], tensor[size : size * 2, :], tensor[-size:, :])
        else:
            suffix = "bias"
            chunks = (tensor[:size], tensor[size : size * 2], tensor[-size:])

        for projection, chunk in zip(("query", "key", "value"), chunks):
            orig_state_dict[f"{prefix}.{projection}.{suffix}"] = chunk

    return orig_state_dict
|
||||
|
||||
|
||||
def convert_swinv2_checkpoint(swinv2_name, pytorch_dump_folder_path):
    """Convert a pretrained timm Swinv2 checkpoint, verify it, save it and push it to the Hub.

    The timm weights are remapped onto a `Swinv2ForImageClassification` built from the
    configuration inferred from `swinv2_name`. Both models are then run on one COCO validation
    image and their logits must agree within `atol=1e-3` before anything is written.

    Args:
        swinv2_name (`str`): Name of the timm model to convert.
        pytorch_dump_folder_path (`str`): Directory the model and feature extractor are saved to.

    Raises:
        ValueError: If the converted model's logits do not match the timm model's logits.
    """
    timm_model = timm.create_model(swinv2_name, pretrained=True)
    timm_model.eval()

    config = get_swinv2_config(swinv2_name)
    model = Swinv2ForImageClassification(config)
    model.eval()

    new_state_dict = convert_state_dict(timm_model.state_dict(), model)
    model.load_state_dict(new_state_dict)

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"

    feature_extractor = AutoFeatureExtractor.from_pretrained(f"microsoft/{swinv2_name.replace('_', '-')}")
    response = requests.get(url, stream=True)
    # Fail on an HTTP error here instead of handing an error page to PIL.
    response.raise_for_status()
    image = Image.open(response.raw)
    inputs = feature_extractor(images=image, return_tensors="pt")

    # Inference only: no autograd graph is needed to compare logits.
    with torch.no_grad():
        timm_outs = timm_model(inputs["pixel_values"])
        hf_outs = model(**inputs).logits

    # Explicit check rather than `assert`, which is stripped when Python runs with -O.
    if not torch.allclose(timm_outs, hf_outs, atol=1e-3):
        raise ValueError(f"Logits of the converted model do not match the timm model for {swinv2_name}")

    print(f"Saving model {swinv2_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)

    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)

    # NOTE(review): the target organization is hard-coded to a personal account and the push is
    # unconditional - confirm this is still intended before running the script.
    model.push_to_hub(
        repo_path_or_name=Path(pytorch_dump_folder_path, swinv2_name),
        organization="nandwalritik",
        commit_message="Add model",
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--swinv2_name",
        # The name is parsed positionally by get_swinv2_config: the third "_"-separated field must
        # be the window size and the fourth the image size. The former default
        # "swinv2_tiny_patch4_window8_256" carried an extra "patch4" field and failed to parse.
        default="swinv2_tiny_window8_256",
        type=str,
        help="Name of the Swinv2 timm model you'd like to convert.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )

    args = parser.parse_args()
    convert_swinv2_checkpoint(args.swinv2_name, args.pytorch_dump_folder_path)
|
1292
src/transformers/models/swinv2/modeling_swinv2.py
Normal file
1292
src/transformers/models/swinv2/modeling_swinv2.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -4469,6 +4469,37 @@ class SwinPreTrainedModel(metaclass=DummyObject):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
SWINV2_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
class Swinv2ForImageClassification(metaclass=DummyObject):
    # Placeholder declared under the real class name; constructing it calls
    # `requires_backends` with the "torch" backend.
    # NOTE(review): presumably that call raises when torch is missing - confirm in `requires_backends`.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class Swinv2ForMaskedImageModeling(metaclass=DummyObject):
    # Placeholder declared under the real class name; constructing it calls
    # `requires_backends` with the "torch" backend.
    # NOTE(review): presumably that call raises when torch is missing - confirm in `requires_backends`.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class Swinv2Model(metaclass=DummyObject):
    # Placeholder declared under the real class name; constructing it calls
    # `requires_backends` with the "torch" backend.
    # NOTE(review): presumably that call raises when torch is missing - confirm in `requires_backends`.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class Swinv2PreTrainedModel(metaclass=DummyObject):
    # Placeholder declared under the real class name; constructing it calls
    # `requires_backends` with the "torch" backend.
    # NOTE(review): presumably that call raises when torch is missing - confirm in `requires_backends`.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
T5_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
|
0
tests/models/swinv2/__init__.py
Normal file
0
tests/models/swinv2/__init__.py
Normal file
430
tests/models/swinv2/test_modeling_swinv2.py
Normal file
430
tests/models/swinv2/test_modeling_swinv2.py
Normal file
@ -0,0 +1,430 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the PyTorch Swinv2 model. """
|
||||
import collections
|
||||
import inspect
|
||||
import unittest
|
||||
|
||||
from transformers import Swinv2Config
|
||||
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
|
||||
from transformers.utils import cached_property, is_torch_available, is_vision_available
|
||||
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from transformers import Swinv2ForImageClassification, Swinv2ForMaskedImageModeling, Swinv2Model
|
||||
from transformers.models.swinv2.modeling_swinv2 import SWINV2_PRETRAINED_MODEL_ARCHIVE_LIST
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
|
||||
|
||||
class Swinv2ModelTester:
    """Builds small Swinv2 configurations and random inputs, and holds the shape checks shared by the tests."""

    def __init__(
        self,
        parent,
        batch_size=13,
        image_size=32,
        patch_size=2,
        num_channels=3,
        embed_dim=16,
        depths=[1, 2, 1],
        num_heads=[2, 2, 4],
        window_size=2,
        mlp_ratio=2.0,
        qkv_bias=True,
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        drop_path_rate=0.1,
        hidden_act="gelu",
        use_absolute_embeddings=False,
        patch_norm=True,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        is_training=True,
        scope=None,
        use_labels=True,
        type_sequence_label_size=10,
        encoder_stride=8,
    ):
        # `parent` is the test case whose assertion methods are used by the checks below.
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.embed_dim = embed_dim
        self.depths = depths
        self.num_heads = num_heads
        self.window_size = window_size
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.drop_path_rate = drop_path_rate
        self.hidden_act = hidden_act
        self.use_absolute_embeddings = use_absolute_embeddings
        self.patch_norm = patch_norm
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.is_training = is_training
        self.scope = scope
        self.use_labels = use_labels
        self.type_sequence_label_size = type_sequence_label_size
        self.encoder_stride = encoder_stride

    def prepare_config_and_inputs(self):
        """Return `(config, pixel_values, labels)`; `labels` is `None` when `use_labels` is false."""
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

        labels = None
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)

        config = self.get_config()

        return config, pixel_values, labels

    def get_config(self):
        """Return a `Swinv2Config` built from this tester's hyper-parameters."""
        return Swinv2Config(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
            embed_dim=self.embed_dim,
            depths=self.depths,
            num_heads=self.num_heads,
            window_size=self.window_size,
            mlp_ratio=self.mlp_ratio,
            qkv_bias=self.qkv_bias,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            drop_path_rate=self.drop_path_rate,
            hidden_act=self.hidden_act,
            use_absolute_embeddings=self.use_absolute_embeddings,
            # The config parameter is spelled `patch_norm`; the former `path_norm=` keyword fell
            # into **kwargs, so this tester's value never reached the parameter.
            patch_norm=self.patch_norm,
            layer_norm_eps=self.layer_norm_eps,
            initializer_range=self.initializer_range,
            encoder_stride=self.encoder_stride,
        )

    def create_and_check_model(self, config, pixel_values, labels):
        """Check the shape of `last_hidden_state` produced by the base model."""
        model = Swinv2Model(config=config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values)

        # Expected sequence length: patch count divided by 4 for every layer after the first;
        # expected width: embed_dim doubled for every layer after the first.
        expected_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1))
        expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))

        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim))

    def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels):
        """Check that the masked-image-modeling logits have the input image shape, for 3 and 1 channels."""
        model = Swinv2ForMaskedImageModeling(config=config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values)
        self.parent.assertEqual(
            result.logits.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size)
        )

        # test greyscale images
        config.num_channels = 1
        model = Swinv2ForMaskedImageModeling(config)
        model.to(torch_device)
        model.eval()

        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
        result = model(pixel_values)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, 1, self.image_size, self.image_size))

    def create_and_check_for_image_classification(self, config, pixel_values, labels):
        """Check that the classification logits have shape `(batch_size, type_sequence_label_size)`."""
        config.num_labels = self.type_sequence_label_size
        model = Swinv2ForImageClassification(config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values, labels=labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` where `inputs_dict` only holds `pixel_values`."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values, labels = config_and_inputs
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
class Swinv2ModelTest(ModelTesterMixin, unittest.TestCase):
    # Model tests for the three Swinv2 model classes, on top of the ModelTesterMixin suite.

    # Empty when torch is unavailable, so nothing is instantiated in that case.
    all_model_classes = (
        (Swinv2Model, Swinv2ForImageClassification, Swinv2ForMaskedImageModeling) if is_torch_available() else ()
    )

    # Flags read by ModelTesterMixin.
    # NOTE(review): presumably each one disables the matching group of common tests - confirm in the mixin.
    fx_compatible = False
    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False

    def setUp(self):
        self.model_tester = Swinv2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=Swinv2Config, embed_dim=37)

    def test_config(self):
        # Runs the individual ConfigTester checks one by one.
        self.config_tester.create_and_test_config_to_json_string()
        self.config_tester.create_and_test_config_to_json_file()
        self.config_tester.create_and_test_config_from_and_save_pretrained()
        self.config_tester.create_and_test_config_with_num_labels()
        self.config_tester.check_config_can_be_init_without_params()
        self.config_tester.check_config_arguments_init()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    @unittest.skip(reason="Swinv2 does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_common_attributes(self):
        # Input embeddings must be an nn.Module; output embeddings must be None or an nn.Linear.
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, nn.Linear))

    def test_forward_signature(self):
        # The first parameter of `forward` must be `pixel_values` for every model class.
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.forward)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["pixel_values"]
            self.assertListEqual(arg_names[:1], expected_arg_names)

    def test_attention_outputs(self):
        # Checks the number and trailing shape of the returned attentions, requested first through
        # the inputs and then through the config. `config` and `inputs_dict` are shared and
        # mutated across the loop, so statement order matters.
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.attentions
            # One attention entry is expected per entry of the tester's `depths`.
            expected_num_attentions = len(self.model_tester.depths)
            self.assertEqual(len(attentions), expected_num_attentions)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            window_size_squared = config.window_size**2
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.attentions
            self.assertEqual(len(attentions), expected_num_attentions)

            # Last three dims of the first attention: (heads of first layer, window**2, window**2).
            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [self.model_tester.num_heads[0], window_size_squared, window_size_squared],
            )
            out_len = len(outputs)

            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            if hasattr(self.model_tester, "num_hidden_states_types"):
                added_hidden_states = self.model_tester.num_hidden_states_types
            else:
                # also another +1 for reshaped_hidden_states
                added_hidden_states = 2
            self.assertEqual(out_len + added_hidden_states, len(outputs))

            self_attentions = outputs.attentions

            self.assertEqual(len(self_attentions), expected_num_attentions)

            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [self.model_tester.num_heads[0], window_size_squared, window_size_squared],
            )

    def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
        # Helper (not collected as a test): runs `model_class` and checks the count and the
        # first-entry shape of `hidden_states` and `reshaped_hidden_states`.
        # `image_size` is indexed as a (height, width) pair.
        model = model_class(config)
        model.to(torch_device)
        model.eval()

        with torch.no_grad():
            outputs = model(**self._prepare_for_class(inputs_dict, model_class))

        hidden_states = outputs.hidden_states

        # Defaults to one more than the number of `depths` entries unless the tester overrides it.
        expected_num_layers = getattr(
            self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1
        )
        self.assertEqual(len(hidden_states), expected_num_layers)

        # Swinv2 has a different seq_length
        patch_size = (
            config.patch_size
            if isinstance(config.patch_size, collections.abc.Iterable)
            else (config.patch_size, config.patch_size)
        )

        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])

        self.assertListEqual(
            list(hidden_states[0].shape[-2:]),
            [num_patches, self.model_tester.embed_dim],
        )

        reshaped_hidden_states = outputs.reshaped_hidden_states
        self.assertEqual(len(reshaped_hidden_states), expected_num_layers)

        # Flatten the 4-D reshaped state back to (batch, height * width, channels) so it can be
        # compared against the same expected trailing shape.
        batch_size, num_channels, height, width = reshaped_hidden_states[0].shape
        reshaped_hidden_states = (
            reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1)
        )
        self.assertListEqual(
            list(reshaped_hidden_states.shape[-2:]),
            [num_patches, self.model_tester.embed_dim],
        )

    def test_hidden_states_output(self):
        # Hidden states are requested once through the inputs and once through the config.
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        image_size = (
            self.model_tester.image_size
            if isinstance(self.model_tester.image_size, collections.abc.Iterable)
            else (self.model_tester.image_size, self.model_tester.image_size)
        )

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            self.check_hidden_states_output(inputs_dict, config, model_class, image_size)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            self.check_hidden_states_output(inputs_dict, config, model_class, image_size)

    def test_hidden_states_output_with_padding(self):
        # Same check with patch_size 3, which does not divide the tester's default image size (32);
        # the expected sizes are the image dimensions rounded up past the next multiple of the patch size.
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.patch_size = 3

        image_size = (
            self.model_tester.image_size
            if isinstance(self.model_tester.image_size, collections.abc.Iterable)
            else (self.model_tester.image_size, self.model_tester.image_size)
        )
        patch_size = (
            config.patch_size
            if isinstance(config.patch_size, collections.abc.Iterable)
            else (config.patch_size, config.patch_size)
        )

        padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0])
        padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1])

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width))

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True
            self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width))

    def test_for_masked_image_modeling(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)

    def test_for_image_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        # Only the first checkpoint of the archive list is loaded.
        for model_name in SWINV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = Swinv2Model.from_pretrained(model_name)
            self.assertIsNotNone(model)

    def test_initialization(self):
        # With the config returned by `_config_zero_init`, every trainable parameter outside the
        # embeddings and `logit_scale` must have a mean of exactly 0.0 or 1.0 (rounded to 9 decimals).
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        configs_no_init = _config_zero_init(config)
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                if "embeddings" not in name and "logit_scale" not in name and param.requires_grad:
                    self.assertIn(
                        ((param.data.mean() * 1e9).round() / 1e9).item(),
                        [0.0, 1.0],
                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                    )
|
||||
|
||||
|
||||
@require_vision
@require_torch
class Swinv2ModelIntegrationTest(unittest.TestCase):
    # Slow end-to-end check of the released tiny checkpoint on a fixture image.

    @cached_property
    def default_feature_extractor(self):
        # No feature extractor can be built without the vision dependencies.
        if not is_vision_available():
            return None
        return AutoFeatureExtractor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

    @slow
    def test_inference_image_classification_head(self):
        classifier = Swinv2ForImageClassification.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
        classifier = classifier.to(torch_device)
        extractor = self.default_feature_extractor

        sample = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        encoded = extractor(images=sample, return_tensors="pt").to(torch_device)

        # forward pass
        with torch.no_grad():
            result = classifier(**encoded)

        # verify the logits: one row of 1000 class scores, whose first three values are pinned
        self.assertEqual(result.logits.shape, torch.Size((1, 1000)))
        reference = torch.tensor([-0.3947, -0.4306, 0.0026]).to(torch_device)
        logits_match = torch.allclose(result.logits[0, :3], reference, atol=1e-4)
        self.assertTrue(logits_match)
|
Loading…
Reference in New Issue
Block a user