mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-31 02:02:21 +06:00
Add nvidia megatron models (#10911)
* Add support for NVIDIA Megatron models * Add support for NVIDIA Megatron GPT2 and BERT Add the megatron_gpt2 model. That model reuses the existing GPT2 model. This commit includes a script to convert a Megatron-GPT2 checkpoint downloaded from NVIDIA GPU Cloud. See examples/megatron-models/README.md for details. Add the megatron_bert model. That model is implemented as a modification of the existing BERT model in Transformers. This commit includes a script to convert a Megatron-BERT checkpoint downloaded from NVIDIA GPU Cloud. See examples/megatron-models/README.md for details. * Update src/transformers/models/megatron_bert/configuration_megatron_bert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/models/megatron_bert/configuration_megatron_bert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/models/megatron_bert/configuration_megatron_bert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Remove model.half in tests + add "# Copied ..." Remove the model.half() instruction which makes tests fail on the CPU. Add a comment "# Copied ..." before many classes in the model to enable automatic tracking in CI between the new Megatron classes and the original Bert ones. * Fix issues * Fix Flax/TF tests * Fix copyright * Update src/transformers/models/megatron_bert/configuration_megatron_bert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/models/megatron_bert/configuration_megatron_bert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update docs/source/model_doc/megatron_bert.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update docs/source/model_doc/megatron_gpt2.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/__init__.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py Co-authored-by: Sylvain Gugger 
<35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Resolve most of 'sgugger' comments * Fix conversion issue + Run make fix-copies/quality/docs * Apply suggestions from code review * Causal LM & merge * Fix init * Add CausalLM to last auto class Co-authored-by: Julien Demouth <jdemouth@nvidia.com> Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
parent
c6d664849b
commit
02ec02d6d3
@ -223,6 +223,8 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
|
||||
1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
|
||||
1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
|
||||
1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
|
||||
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
|
@ -178,58 +178,64 @@ and conversion utilities for the following models:
|
||||
32. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
|
||||
Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
|
||||
Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
33. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
|
||||
33. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
|
||||
Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
|
||||
Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
34. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
|
||||
Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
|
||||
Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
35. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
|
||||
Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
|
||||
Jianfeng Lu, Tie-Yan Liu.
|
||||
34. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
|
||||
36. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
|
||||
text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
|
||||
Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
|
||||
35. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
|
||||
37. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
|
||||
Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
|
||||
Mohammad Saleh and Peter J. Liu.
|
||||
36. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
|
||||
38. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
|
||||
Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
|
||||
Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
37. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
|
||||
39. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
|
||||
Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
|
||||
38. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
|
||||
40. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
|
||||
Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
|
||||
Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
39. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
|
||||
41. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
|
||||
`fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
|
||||
Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
|
||||
40. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
|
||||
42. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
|
||||
about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
|
||||
Krishna, and Kurt W. Keutzer.
|
||||
41. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
|
||||
43. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
|
||||
Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
|
||||
Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
42. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
|
||||
44. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
|
||||
Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
|
||||
Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
43. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
|
||||
45. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
|
||||
Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
|
||||
Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
44. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
|
||||
46. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
|
||||
Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
|
||||
Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
|
||||
Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
45. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
|
||||
47. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
|
||||
Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
|
||||
Zhou, Abdelrahman Mohamed, Michael Auli.
|
||||
46. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
|
||||
48. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
|
||||
Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
|
||||
47. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
|
||||
49. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
|
||||
Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
|
||||
Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
48. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
|
||||
50. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
|
||||
Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
|
||||
Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
|
||||
Zettlemoyer and Veselin Stoyanov.
|
||||
49. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
|
||||
51. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
|
||||
Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
|
||||
Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
50. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
|
||||
52. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
|
||||
Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
|
||||
Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
|
||||
@ -304,6 +310,8 @@ TensorFlow and/or Flax.
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Marian | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
@ -449,6 +457,8 @@ TensorFlow and/or Flax.
|
||||
model_doc/marian
|
||||
model_doc/m2m_100
|
||||
model_doc/mbart
|
||||
model_doc/megatron_bert
|
||||
model_doc/megatron_gpt2
|
||||
model_doc/mobilebert
|
||||
model_doc/mpnet
|
||||
model_doc/mt5
|
||||
|
153
docs/source/model_doc/megatron_bert.rst
Normal file
153
docs/source/model_doc/megatron_bert.rst
Normal file
@ -0,0 +1,153 @@
|
||||
..
|
||||
Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
MegatronBERT
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The MegatronBERT model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
|
||||
Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
|
||||
Jared Casper and Bryan Catanzaro.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in
|
||||
Natural Language Processing applications. However, very large models can be quite difficult to train due to memory
|
||||
constraints. In this work, we present our techniques for training very large transformer models and implement a simple,
|
||||
efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our
|
||||
approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model
|
||||
parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We
|
||||
illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain
|
||||
15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline
|
||||
that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance
|
||||
the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9
|
||||
billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in
|
||||
BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we
|
||||
achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA
|
||||
accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy
|
||||
of 89.4%).*
|
||||
|
||||
Tips:
|
||||
|
||||
We have provided pretrained `BERT-345M <https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m>`__ checkpoints
|
||||
for use to evaluate or finetuning downstream tasks.
|
||||
|
||||
To access these checkpoints, first `sign up <https://ngc.nvidia.com/signup>`__ for and setup the NVIDIA GPU Cloud (NGC)
|
||||
Registry CLI. Further documentation for downloading models can be found in the `NGC documentation
|
||||
<https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1>`__.
|
||||
|
||||
Alternatively, you can directly download the checkpoints using:
|
||||
|
||||
BERT-345M-uncased::
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip
|
||||
-O megatron_bert_345m_v0_1_uncased.zip
|
||||
|
||||
BERT-345M-cased::
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O
|
||||
megatron_bert_345m_v0_1_cased.zip
|
||||
|
||||
Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will
|
||||
easily be loaded by Hugging Face Transformers and our port of the BERT code.
|
||||
|
||||
The following commands allow you to do the conversion. We assume that the folder ``models/megatron_bert`` contains
|
||||
``megatron_bert_345m_v0_1_{cased, uncased}.zip`` and that the commands are run from inside that folder::
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip
|
||||
|
||||
The original code can be found `here <https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU
|
||||
and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel
|
||||
approach using "tensor parallel" and "pipeline parallel" techniques.
|
||||
|
||||
MegatronBertConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.MegatronBertConfig
|
||||
:members:
|
||||
|
||||
|
||||
MegatronBertModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.MegatronBertModel
|
||||
:members: forward
|
||||
|
||||
|
||||
MegatronBertForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.MegatronBertForMaskedLM
|
||||
:members: forward
|
||||
|
||||
|
||||
MegatronBertForCausalLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.MegatronBertForCausalLM
|
||||
:members: forward
|
||||
|
||||
|
||||
MegatronBertForNextSentencePrediction
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.MegatronBertForNextSentencePrediction
|
||||
:members: forward
|
||||
|
||||
|
||||
MegatronBertForPreTraining
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.MegatronBertForPreTraining
|
||||
:members: forward
|
||||
|
||||
|
||||
MegatronBertForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.MegatronBertForSequenceClassification
|
||||
:members: forward
|
||||
|
||||
|
||||
MegatronBertForMultipleChoice
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.MegatronBertForMultipleChoice
|
||||
:members: forward
|
||||
|
||||
|
||||
MegatronBertForTokenClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.MegatronBertForTokenClassification
|
||||
:members: forward
|
||||
|
||||
|
||||
MegatronBertForQuestionAnswering
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.MegatronBertForQuestionAnswering
|
||||
:members: forward
|
||||
|
||||
|
70
docs/source/model_doc/megatron_gpt2.rst
Normal file
70
docs/source/model_doc/megatron_gpt2.rst
Normal file
@ -0,0 +1,70 @@
|
||||
..
|
||||
Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
MegatronGPT2
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The MegatronGPT2 model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
|
||||
Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
|
||||
Jared Casper and Bryan Catanzaro.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in
|
||||
Natural Language Processing applications. However, very large models can be quite difficult to train due to memory
|
||||
constraints. In this work, we present our techniques for training very large transformer models and implement a simple,
|
||||
efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our
|
||||
approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model
|
||||
parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We
|
||||
illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain
|
||||
15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline
|
||||
that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance
|
||||
the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9
|
||||
billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in
|
||||
BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we
|
||||
achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA
|
||||
accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy
|
||||
of 89.4%).*
|
||||
|
||||
Tips:
|
||||
|
||||
We have provided pretrained `GPT2-345M <https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m>`__ checkpoints
|
||||
for use to evaluate or finetuning downstream tasks.
|
||||
|
||||
To access these checkpoints, first `sign up <https://ngc.nvidia.com/signup>`__ for and setup the NVIDIA GPU Cloud (NGC)
|
||||
Registry CLI. Further documentation for downloading models can be found in the `NGC documentation
|
||||
<https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1>`__.
|
||||
|
||||
Alternatively, you can directly download the checkpoints using::
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O
|
||||
megatron_gpt2_345m_v0_0.zip
|
||||
|
||||
Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to convert it to a format that will easily
|
||||
be loaded by Hugging Face Transformers GPT2 implementation.
|
||||
|
||||
The following command allows you to do the conversion. We assume that the folder ``models/megatron_gpt2`` contains
|
||||
``megatron_gpt2_345m_v0_0.zip`` and that the command is run from that folder::
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip
|
||||
|
||||
The original code can be found `here <https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU
|
||||
and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel
|
||||
approach using "tensor parallel" and "pipeline parallel" techniques.
|
||||
|
@ -191,6 +191,7 @@ _import_structure = {
|
||||
"models.m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"],
|
||||
"models.marian": ["MarianConfig"],
|
||||
"models.mbart": ["MBartConfig"],
|
||||
"models.megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"],
|
||||
"models.mmbt": ["MMBTConfig"],
|
||||
"models.mobilebert": ["MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileBertConfig", "MobileBertTokenizer"],
|
||||
"models.mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig", "MPNetTokenizer"],
|
||||
@ -765,6 +766,20 @@ if is_torch_available():
|
||||
"MBartModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.megatron_bert"].extend(
|
||||
[
|
||||
"MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"MegatronBertForCausalLM",
|
||||
"MegatronBertForMaskedLM",
|
||||
"MegatronBertForMultipleChoice",
|
||||
"MegatronBertForNextSentencePrediction",
|
||||
"MegatronBertForPreTraining",
|
||||
"MegatronBertForQuestionAnswering",
|
||||
"MegatronBertForSequenceClassification",
|
||||
"MegatronBertForTokenClassification",
|
||||
"MegatronBertModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.mmbt"].extend(["MMBTForClassification", "MMBTModel", "ModalEmbeddings"])
|
||||
_import_structure["models.mobilebert"].extend(
|
||||
[
|
||||
@ -1514,6 +1529,7 @@ if TYPE_CHECKING:
|
||||
from .models.m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config
|
||||
from .models.marian import MarianConfig
|
||||
from .models.mbart import MBartConfig
|
||||
from .models.megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig
|
||||
from .models.mmbt import MMBTConfig
|
||||
from .models.mobilebert import MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileBertConfig, MobileBertTokenizer
|
||||
from .models.mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig, MPNetTokenizer
|
||||
@ -1999,6 +2015,18 @@ if TYPE_CHECKING:
|
||||
MBartForSequenceClassification,
|
||||
MBartModel,
|
||||
)
|
||||
from .models.megatron_bert import (
|
||||
MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
MegatronBertForCausalLM,
|
||||
MegatronBertForMaskedLM,
|
||||
MegatronBertForMultipleChoice,
|
||||
MegatronBertForNextSentencePrediction,
|
||||
MegatronBertForPreTraining,
|
||||
MegatronBertForQuestionAnswering,
|
||||
MegatronBertForSequenceClassification,
|
||||
MegatronBertForTokenClassification,
|
||||
MegatronBertModel,
|
||||
)
|
||||
from .models.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
|
||||
from .models.mobilebert import (
|
||||
MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
|
@ -50,6 +50,7 @@ from . import (
|
||||
m2m_100,
|
||||
marian,
|
||||
mbart,
|
||||
megatron_bert,
|
||||
mmbt,
|
||||
mobilebert,
|
||||
mpnet,
|
||||
|
@ -50,6 +50,7 @@ from ..lxmert.configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
from ..m2m_100.configuration_m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config
|
||||
from ..marian.configuration_marian import MarianConfig
|
||||
from ..mbart.configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig
|
||||
from ..megatron_bert.configuration_megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig
|
||||
from ..mobilebert.configuration_mobilebert import MobileBertConfig
|
||||
from ..mpnet.configuration_mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig
|
||||
from ..mt5.configuration_mt5 import MT5Config
|
||||
@ -85,6 +86,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
|
||||
# Add archive maps here
|
||||
GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
VIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
@ -155,6 +157,7 @@ CONFIG_MAPPING = OrderedDict(
|
||||
("pegasus", PegasusConfig),
|
||||
("marian", MarianConfig),
|
||||
("mbart", MBartConfig),
|
||||
("megatron_bert", MegatronBertConfig),
|
||||
("mpnet", MPNetConfig),
|
||||
("bart", BartConfig),
|
||||
("blenderbot", BlenderbotConfig),
|
||||
@ -211,6 +214,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
|
||||
("blenderbot", "Blenderbot"),
|
||||
("marian", "Marian"),
|
||||
("mbart", "mBART"),
|
||||
("megatron_bert", "MegatronBert"),
|
||||
("bart", "BART"),
|
||||
("reformer", "Reformer"),
|
||||
("longformer", "Longformer"),
|
||||
|
@ -174,6 +174,17 @@ from ..mbart.modeling_mbart import (
|
||||
MBartForSequenceClassification,
|
||||
MBartModel,
|
||||
)
|
||||
from ..megatron_bert.modeling_megatron_bert import (
|
||||
MegatronBertForCausalLM,
|
||||
MegatronBertForMaskedLM,
|
||||
MegatronBertForMultipleChoice,
|
||||
MegatronBertForNextSentencePrediction,
|
||||
MegatronBertForPreTraining,
|
||||
MegatronBertForQuestionAnswering,
|
||||
MegatronBertForSequenceClassification,
|
||||
MegatronBertForTokenClassification,
|
||||
MegatronBertModel,
|
||||
)
|
||||
from ..mobilebert.modeling_mobilebert import (
|
||||
MobileBertForMaskedLM,
|
||||
MobileBertForMultipleChoice,
|
||||
@ -298,6 +309,7 @@ from .configuration_auto import (
|
||||
M2M100Config,
|
||||
MarianConfig,
|
||||
MBartConfig,
|
||||
MegatronBertConfig,
|
||||
MobileBertConfig,
|
||||
MPNetConfig,
|
||||
MT5Config,
|
||||
@ -355,6 +367,7 @@ MODEL_MAPPING = OrderedDict(
|
||||
(BertConfig, BertModel),
|
||||
(OpenAIGPTConfig, OpenAIGPTModel),
|
||||
(GPT2Config, GPT2Model),
|
||||
(MegatronBertConfig, MegatronBertModel),
|
||||
(MobileBertConfig, MobileBertModel),
|
||||
(TransfoXLConfig, TransfoXLModel),
|
||||
(XLNetConfig, XLNetModel),
|
||||
@ -398,6 +411,7 @@ MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
|
||||
(BigBirdConfig, BigBirdForPreTraining),
|
||||
(OpenAIGPTConfig, OpenAIGPTLMHeadModel),
|
||||
(GPT2Config, GPT2LMHeadModel),
|
||||
(MegatronBertConfig, MegatronBertForPreTraining),
|
||||
(MobileBertConfig, MobileBertForPreTraining),
|
||||
(TransfoXLConfig, TransfoXLLMHeadModel),
|
||||
(XLNetConfig, XLNetLMHeadModel),
|
||||
@ -441,6 +455,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
|
||||
(BertConfig, BertForMaskedLM),
|
||||
(OpenAIGPTConfig, OpenAIGPTLMHeadModel),
|
||||
(GPT2Config, GPT2LMHeadModel),
|
||||
(MegatronBertConfig, MegatronBertForMaskedLM),
|
||||
(MobileBertConfig, MobileBertForMaskedLM),
|
||||
(TransfoXLConfig, TransfoXLLMHeadModel),
|
||||
(XLNetConfig, XLNetLMHeadModel),
|
||||
@ -456,6 +471,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
|
||||
(DebertaConfig, DebertaForMaskedLM),
|
||||
(DebertaV2Config, DebertaV2ForMaskedLM),
|
||||
(IBertConfig, IBertForMaskedLM),
|
||||
(MegatronBertConfig, MegatronBertForCausalLM),
|
||||
]
|
||||
)
|
||||
|
||||
@ -487,6 +503,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict(
|
||||
(MarianConfig, MarianForCausalLM),
|
||||
(BlenderbotConfig, BlenderbotForCausalLM),
|
||||
(BlenderbotSmallConfig, BlenderbotSmallForCausalLM),
|
||||
(MegatronBertConfig, MegatronBertForCausalLM),
|
||||
]
|
||||
)
|
||||
|
||||
@ -514,6 +531,7 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
|
||||
(RobertaConfig, RobertaForMaskedLM),
|
||||
(SqueezeBertConfig, SqueezeBertForMaskedLM),
|
||||
(BertConfig, BertForMaskedLM),
|
||||
(MegatronBertConfig, MegatronBertForMaskedLM),
|
||||
(MobileBertConfig, MobileBertForMaskedLM),
|
||||
(FlaubertConfig, FlaubertWithLMHeadModel),
|
||||
(XLMConfig, XLMWithLMHeadModel),
|
||||
@ -566,6 +584,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
|
||||
(LayoutLMConfig, LayoutLMForSequenceClassification),
|
||||
(BertConfig, BertForSequenceClassification),
|
||||
(XLNetConfig, XLNetForSequenceClassification),
|
||||
(MegatronBertConfig, MegatronBertForSequenceClassification),
|
||||
(MobileBertConfig, MobileBertForSequenceClassification),
|
||||
(FlaubertConfig, FlaubertForSequenceClassification),
|
||||
(XLMConfig, XLMForSequenceClassification),
|
||||
@ -602,6 +621,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
|
||||
(BertConfig, BertForQuestionAnswering),
|
||||
(XLNetConfig, XLNetForQuestionAnsweringSimple),
|
||||
(FlaubertConfig, FlaubertForQuestionAnsweringSimple),
|
||||
(MegatronBertConfig, MegatronBertForQuestionAnswering),
|
||||
(MobileBertConfig, MobileBertForQuestionAnswering),
|
||||
(XLMConfig, XLMForQuestionAnsweringSimple),
|
||||
(ElectraConfig, ElectraForQuestionAnswering),
|
||||
@ -637,6 +657,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
|
||||
(RobertaConfig, RobertaForTokenClassification),
|
||||
(SqueezeBertConfig, SqueezeBertForTokenClassification),
|
||||
(BertConfig, BertForTokenClassification),
|
||||
(MegatronBertConfig, MegatronBertForTokenClassification),
|
||||
(MobileBertConfig, MobileBertForTokenClassification),
|
||||
(XLNetConfig, XLNetForTokenClassification),
|
||||
(AlbertConfig, AlbertForTokenClassification),
|
||||
@ -663,6 +684,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
|
||||
(SqueezeBertConfig, SqueezeBertForMultipleChoice),
|
||||
(BertConfig, BertForMultipleChoice),
|
||||
(DistilBertConfig, DistilBertForMultipleChoice),
|
||||
(MegatronBertConfig, MegatronBertForMultipleChoice),
|
||||
(MobileBertConfig, MobileBertForMultipleChoice),
|
||||
(XLNetConfig, XLNetForMultipleChoice),
|
||||
(AlbertConfig, AlbertForMultipleChoice),
|
||||
@ -677,6 +699,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
|
||||
MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = OrderedDict(
|
||||
[
|
||||
(BertConfig, BertForNextSentencePrediction),
|
||||
(MegatronBertConfig, MegatronBertForNextSentencePrediction),
|
||||
(MobileBertConfig, MobileBertForNextSentencePrediction),
|
||||
]
|
||||
)
|
||||
|
74
src/transformers/models/megatron_bert/__init__.py
Normal file
74
src/transformers/models/megatron_bert/__init__.py
Normal file
@ -0,0 +1,74 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...file_utils import _BaseLazyModule, is_torch_available
|
||||
|
||||
|
||||
_import_structure = {
|
||||
"configuration_megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"],
|
||||
}
|
||||
|
||||
if is_torch_available():
|
||||
_import_structure["modeling_megatron_bert"] = [
|
||||
"MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"MegatronBertForCausalLM",
|
||||
"MegatronBertForMaskedLM",
|
||||
"MegatronBertForMultipleChoice",
|
||||
"MegatronBertForNextSentencePrediction",
|
||||
"MegatronBertForPreTraining",
|
||||
"MegatronBertForQuestionAnswering",
|
||||
"MegatronBertForSequenceClassification",
|
||||
"MegatronBertForTokenClassification",
|
||||
"MegatronBertModel",
|
||||
]
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig
|
||||
|
||||
if is_torch_available():
|
||||
from .modeling_megatron_bert import (
|
||||
MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
MegatronBertForCausalLM,
|
||||
MegatronBertForMaskedLM,
|
||||
MegatronBertForMultipleChoice,
|
||||
MegatronBertForNextSentencePrediction,
|
||||
MegatronBertForPreTraining,
|
||||
MegatronBertForQuestionAnswering,
|
||||
MegatronBertForSequenceClassification,
|
||||
MegatronBertForTokenClassification,
|
||||
MegatronBertModel,
|
||||
)
|
||||
|
||||
else:
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
class _LazyModule(_BaseLazyModule):
|
||||
"""
|
||||
Module class that surfaces all objects but only performs associated imports when the objects are requested.
|
||||
"""
|
||||
|
||||
__file__ = globals()["__file__"]
|
||||
__path__ = [os.path.dirname(__file__)]
|
||||
|
||||
def _get_module(self, module_name: str):
|
||||
return importlib.import_module("." + module_name, self.__name__)
|
||||
|
||||
sys.modules[__name__] = _LazyModule(__name__, _import_structure)
|
@ -0,0 +1,132 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021- NVIDIA Corporation and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" MEGATRON_BERT model configuration """
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
# See all MEGATRON_BERT models at https://huggingface.co/models?filter=bert
|
||||
}
|
||||
|
||||
|
||||
class MegatronBertConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel`. It is
|
||||
used to instantiate a MEGATRON_BERT model according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT
|
||||
`megatron-bert-uncased-345m <https://huggingface.co/nvidia/megatron-bert-uncased-345m>`__ architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
|
||||
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 29056):
|
||||
Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented
|
||||
by the :obj:`inputs_ids` passed when calling :class:`~transformers.MegatronBertModel`.
|
||||
hidden_size (:obj:`int`, `optional`, defaults to 1024):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
intermediate_size (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
|
||||
hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
|
||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
|
||||
The vocabulary size of the :obj:`token_type_ids` passed when calling
|
||||
:class:`~transformers.MegatronBertModel`.
|
||||
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||
position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
|
||||
Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
|
||||
:obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
|
||||
:obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
|
||||
<https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
|
||||
`Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
|
||||
<https://arxiv.org/abs/2009.13658>`__.
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if ``config.is_decoder=True``.
|
||||
|
||||
Examples::
|
||||
|
||||
>>> from transformers import MegatronBertModel, MegatronBertConfig
|
||||
|
||||
>>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration
|
||||
>>> configuration = MegatronBertConfig()
|
||||
|
||||
>>> # Initializing a model from the bert-base-uncased style configuration
|
||||
>>> model = MegatronBertModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "megatron-bert"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=29056,
|
||||
hidden_size=1024,
|
||||
num_hidden_layers=24,
|
||||
num_attention_heads=16,
|
||||
intermediate_size=4096,
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=512,
|
||||
type_vocab_size=2,
|
||||
initializer_range=0.02,
|
||||
layer_norm_eps=1e-12,
|
||||
pad_token_id=0,
|
||||
gradient_checkpointing=False,
|
||||
position_embedding_type="absolute",
|
||||
use_cache=True,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(pad_token_id=pad_token_id, **kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.position_embedding_type = position_embedding_type
|
||||
self.use_cache = use_cache
|
@ -0,0 +1,265 @@
|
||||
####################################################################################################
|
||||
|
||||
# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
####################################################################################################
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
####################################################################################################
|
||||
|
||||
|
||||
def recursive_print(name, val, spaces=0):
|
||||
# Format the message.
|
||||
if name is None:
|
||||
msg = None
|
||||
else:
|
||||
fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}"
|
||||
msg = fmt.format(name)
|
||||
|
||||
# Print and recurse (if needed).
|
||||
if isinstance(val, dict):
|
||||
if msg is not None:
|
||||
print(msg)
|
||||
for k in val.keys():
|
||||
recursive_print(k, val[k], spaces + 2)
|
||||
elif isinstance(val, torch.Tensor):
|
||||
print(msg, ":", val.size())
|
||||
else:
|
||||
print(msg, ":", val)
|
||||
|
||||
|
||||
####################################################################################################
|
||||
|
||||
|
||||
def convert_megatron_checkpoint(args, input_state_dict):
|
||||
# The converted output model.
|
||||
output_state_dict = {}
|
||||
|
||||
# The model.
|
||||
model = input_state_dict["model"]
|
||||
# The language model.
|
||||
lm = model["language_model"]
|
||||
# The embeddings.
|
||||
embeddings = lm["embedding"]
|
||||
|
||||
# The word embeddings.
|
||||
word_embeddings = embeddings["word_embeddings"]["weight"]
|
||||
# Store the word embeddings.
|
||||
output_state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings
|
||||
|
||||
# The position embeddings.
|
||||
pos_embeddings = embeddings["position_embeddings"]["weight"]
|
||||
# Trained for 512 x 1024.
|
||||
assert pos_embeddings.size(0) == 512 and pos_embeddings.size(1) == 1024
|
||||
# Store the position embeddings.
|
||||
output_state_dict["bert.embeddings.position_embeddings.weight"] = pos_embeddings
|
||||
|
||||
# The token-type embeddings.
|
||||
tokentype_embeddings = embeddings["tokentype_embeddings"]["weight"]
|
||||
# Store the position embeddings.
|
||||
output_state_dict["bert.embeddings.token_type_embeddings.weight"] = tokentype_embeddings
|
||||
|
||||
# The transformer.
|
||||
transformer = lm["transformer"]
|
||||
|
||||
# The regex to extract layer names.
|
||||
layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
|
||||
|
||||
# The simple map of names for "automated" rules.
|
||||
megatron_to_transformers = {
|
||||
"attention.dense": ".attention.output.dense.",
|
||||
"mlp.dense_h_to_4h": ".intermediate.dense.",
|
||||
"mlp.dense_4h_to_h": ".output.dense.",
|
||||
}
|
||||
|
||||
# Keep track of the attention/query/value tensor.
|
||||
attention_qkv_weight = None
|
||||
|
||||
# Extract the layers.
|
||||
for key, val in transformer.items():
|
||||
# Match the name.
|
||||
m = layer_re.match(key)
|
||||
|
||||
# Stop if that's not a layer
|
||||
if m is None:
|
||||
break
|
||||
|
||||
# The index of the layer.
|
||||
layer_idx = int(m.group(1))
|
||||
# The name of the operation.
|
||||
op_name = m.group(2)
|
||||
# Is it a weight or a bias?
|
||||
weight_or_bias = m.group(3)
|
||||
|
||||
# The name of the layer.
|
||||
layer_name = f"bert.encoder.layer.{layer_idx}"
|
||||
|
||||
# For layernorm(s), simply store the layer norm.
|
||||
if op_name.endswith("layernorm"):
|
||||
|
||||
ln_name = "attention.ln" if op_name.startswith("input") else "ln"
|
||||
output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val
|
||||
|
||||
# Transpose the QKV matrix.
|
||||
elif op_name == "attention.query_key_value" and weight_or_bias == "weight":
|
||||
|
||||
# Make sure the QKV pointer is nil.
|
||||
assert attention_qkv_weight is None, ""
|
||||
|
||||
# Store the tensor as we need the bias as well to interleave QKV and biases.
|
||||
attention_qkv_weight = val
|
||||
|
||||
# Transpose the bias.
|
||||
elif op_name == "attention.query_key_value" and weight_or_bias == "bias":
|
||||
|
||||
# Make sure we read the weight tensor.
|
||||
assert attention_qkv_weight is not None, ""
|
||||
|
||||
# Split the QKV matrix into Q, K and V. Megatron stores Q,K,V interleaved.
|
||||
q = attention_qkv_weight[0 * 1024 : 1 * 1024, :]
|
||||
k = attention_qkv_weight[1 * 1024 : 2 * 1024, :]
|
||||
v = attention_qkv_weight[2 * 1024 : 3 * 1024, :]
|
||||
|
||||
# Split the bias.
|
||||
q_bias = val[0 * 1024 : 1 * 1024]
|
||||
k_bias = val[1 * 1024 : 2 * 1024]
|
||||
v_bias = val[2 * 1024 : 3 * 1024]
|
||||
|
||||
# Store.
|
||||
output_state_dict[f"{layer_name}.attention.self.query.weight"] = q
|
||||
output_state_dict[f"{layer_name}.attention.self.query.bias"] = q_bias
|
||||
output_state_dict[f"{layer_name}.attention.self.key.weight"] = k
|
||||
output_state_dict[f"{layer_name}.attention.self.key.bias"] = k_bias
|
||||
output_state_dict[f"{layer_name}.attention.self.value.weight"] = v
|
||||
output_state_dict[f"{layer_name}.attention.self.value.bias"] = v_bias
|
||||
|
||||
# Clear the stored tensor.
|
||||
attention_qkv_weight = None
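# Note: the 1024-row slices above assume the 345M checkpoint (hidden size 1024) with Q, K and V
# stacked along the first dimension; a differently sized checkpoint would need adjusted slice sizes.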
|
||||
|
||||
# Copy weights and biases as is.
|
||||
elif weight_or_bias in ["weight", "bias"]:
|
||||
|
||||
out_name = megatron_to_transformers[op_name]
|
||||
output_state_dict[layer_name + out_name + weight_or_bias] = val
|
||||
|
||||
# The final layernorm.
|
||||
output_state_dict["bert.encoder.ln.weight"] = transformer["final_layernorm.weight"]
|
||||
output_state_dict["bert.encoder.ln.bias"] = transformer["final_layernorm.bias"]
|
||||
|
||||
# The config.
|
||||
output_config = {
|
||||
"vocab_size": word_embeddings.size(0),
|
||||
"hidden_size": 1024,
|
||||
"num_hidden_layers": 24,
|
||||
"num_attention_heads": 16,
|
||||
"hidden_act": "gelu_new",
|
||||
"intermediate_size": 4096,
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"max_position_embeddings": 512,
|
||||
"type_vocab_size": 2,
|
||||
"initializer_range": 0.2,
|
||||
"layer_norm_eps": 1e-12,
|
||||
"gradient_checkpointing": False,
|
||||
"position_embedding_type": "absolute",
|
||||
"use_cache": False,
|
||||
}
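# These values describe the released 345M Megatron-BERT checkpoint (BERT-large sized:
# 24 layers, hidden size 1024, 16 attention heads); a differently sized checkpoint would
# require adjusting them.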
|
||||
|
||||
# The pooler.
|
||||
pooler = lm["pooler"]
|
||||
|
||||
# Store the matrix and the bias.
|
||||
output_state_dict["bert.pooler.dense.weight"] = pooler["dense.weight"]
|
||||
output_state_dict["bert.pooler.dense.bias"] = pooler["dense.bias"]
|
||||
|
||||
# The LM head from Megatron (for RACE).
|
||||
lm_head = model["lm_head"]
|
||||
|
||||
# The transform matrix.
|
||||
output_state_dict["cls.predictions.transform.dense.weight"] = lm_head["dense.weight"]
|
||||
output_state_dict["cls.predictions.transform.dense.bias"] = lm_head["dense.bias"]
|
||||
|
||||
# The transform LN.
|
||||
output_state_dict["cls.predictions.transform.LayerNorm.weight"] = lm_head["layernorm.weight"]
|
||||
output_state_dict["cls.predictions.transform.LayerNorm.bias"] = lm_head["layernorm.bias"]
|
||||
|
||||
# For the decoder, we replicate the weights.
|
||||
output_state_dict["cls.predictions.decoder.weight"] = word_embeddings
|
||||
output_state_dict["cls.predictions.bias"] = lm_head["bias"]
|
||||
|
||||
# The classifier from Megatron (for MNLI).
|
||||
binary_head = model["binary_head"]
|
||||
|
||||
# Store the classifier.
|
||||
output_state_dict["cls.seq_relationship.weight"] = binary_head["weight"]
|
||||
output_state_dict["cls.seq_relationship.bias"] = binary_head["bias"]
|
||||
|
||||
# It should be done!
|
||||
return output_state_dict, output_config
|
||||
|
||||
|
||||
####################################################################################################
|
||||
|
||||
|
||||
def main():
|
||||
# Create the argument parser.
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--print-checkpoint-structure", action="store_true")
|
||||
parser.add_argument("path_to_checkpoint", type=str, help="Path to the ZIP file containing the checkpoint")
|
||||
args = parser.parse_args()
|
||||
|
||||
# The directory that contains the checkpoint; the converted files are written next to it.
|
||||
basename = os.path.dirname(args.path_to_checkpoint)
|
||||
|
||||
# Load the model.
|
||||
print(f'Extracting PyTorch state dictionary from "{args.path_to_checkpoint}"')
|
||||
with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
|
||||
with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
|
||||
input_state_dict = torch.load(pytorch_dict, map_location="cpu")
|
||||
|
||||
# Convert.
|
||||
print("Converting")
|
||||
output_state_dict, output_config = convert_megatron_checkpoint(args, input_state_dict)
|
||||
|
||||
# Print the structure of converted state dict.
|
||||
if args.print_checkpoint_structure:
|
||||
recursive_print(None, output_state_dict)
|
||||
|
||||
# Store the config to file.
|
||||
output_config_file = os.path.join(basename, "config.json")
|
||||
print(f'Saving config to "{output_config_file}"')
|
||||
with open(output_config_file, "w") as f:
|
||||
json.dump(output_config, f)
|
||||
|
||||
# Store the state_dict to file.
|
||||
output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
|
||||
print(f'Saving checkpoint to "{output_checkpoint_file}"')
|
||||
torch.save(output_state_dict, output_checkpoint_file)
|
||||
|
||||
|
||||
####################################################################################################
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
####################################################################################################
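As a quick sanity check, the directory produced by the converter above (config.json and pytorch_model.bin written next to the checkpoint ZIP) should load directly with Transformers. The snippet below is only an illustrative sketch, not part of this pull request; the checkpoint path is a placeholder, and the input ids are the same ones used by the slow integration test further down in this diff.

import torch
from transformers import MegatronBertModel

checkpoint_dir = "/path/to/megatron_bert_345m"  # placeholder: directory holding config.json + pytorch_model.bin

model = MegatronBertModel.from_pretrained(checkpoint_dir)
model.eval()

input_ids = torch.tensor([[101, 7110, 1005, 1056, 2023, 11333, 17413, 1029, 102]])
with torch.no_grad():
    hidden_states = model(input_ids).last_hidden_state
print(hidden_states.shape)  # expected: torch.Size([1, 9, 1024])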
|
src/transformers/models/megatron_bert/modeling_megatron_bert.py (1827 lines, executable file): diff suppressed because it is too large.
@@ -0,0 +1,238 @@
|
||||
####################################################################################################
|
||||
|
||||
# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
####################################################################################################
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
####################################################################################################
|
||||
|
||||
|
||||
def recursive_print(name, val, spaces=0):
|
||||
# Format the message.
|
||||
if name is None:
|
||||
msg = None
|
||||
else:
|
||||
fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}"
|
||||
msg = fmt.format(name)
|
||||
|
||||
# Print and recurse (if needed).
|
||||
if isinstance(val, dict):
|
||||
if msg is not None:
|
||||
print(msg)
|
||||
for k in val.keys():
|
||||
recursive_print(k, val[k], spaces + 2)
|
||||
elif isinstance(val, torch.Tensor):
|
||||
print(msg, ":", val.size())
|
||||
else:
|
||||
print(msg, ":", val)
|
||||
|
||||
|
||||
####################################################################################################
|
||||
|
||||
|
||||
def convert_megatron_checkpoint(args, input_state_dict):
|
||||
# The converted output model.
|
||||
output_state_dict = {}
|
||||
|
||||
# The number of heads.
|
||||
heads = 16
|
||||
# The hidden_size per head.
|
||||
hidden_size_per_head = 64
|
||||
|
||||
# The model.
|
||||
model = input_state_dict["model"]
|
||||
# The language model.
|
||||
lm = model["language_model"]
|
||||
# The embeddings.
|
||||
embeddings = lm["embedding"]
|
||||
|
||||
# The word embeddings.
|
||||
word_embeddings = embeddings["word_embeddings"]["weight"]
|
||||
# Truncate the embedding table to 50257 rows.
|
||||
word_embeddings = word_embeddings[:50257, :]
|
||||
# Store the word embeddings.
|
||||
output_state_dict["transformer.wte.weight"] = word_embeddings
|
||||
|
||||
# The position embeddings.
|
||||
pos_embeddings = embeddings["position_embeddings"]["weight"]
|
||||
# Read the hidden dimension from the position-embedding table (for the 345M checkpoint targeted here, size(0) equals the hidden size).
|
||||
hidden_size = pos_embeddings.size(0)
|
||||
# Sanity check: the hidden size must equal heads * hidden_size_per_head.
|
||||
assert hidden_size == heads * hidden_size_per_head
|
||||
# Store the position embeddings.
|
||||
output_state_dict["transformer.wpe.weight"] = pos_embeddings
|
||||
|
||||
# The transformer.
|
||||
transformer = lm["transformer"]
|
||||
|
||||
# The regex to extract layer names.
|
||||
layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
|
||||
|
||||
# The simple map of names for "automated" rules.
|
||||
megatron_to_transformers = {
|
||||
"attention.dense": ".attn.c_proj.",
|
||||
"mlp.dense_h_to_4h": ".mlp.c_fc.",
|
||||
"mlp.dense_4h_to_h": ".mlp.c_proj.",
|
||||
}
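# With this map, a key such as "layers.0.mlp.dense_h_to_4h.weight" becomes
# "transformer.h.0.mlp.c_fc.weight" (and is transposed below, since Transformers' Conv1D
# layers keep their weights as input x output).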
|
||||
|
||||
# Extract the layers.
|
||||
for key, val in transformer.items():
|
||||
# Match the name.
|
||||
m = layer_re.match(key)
|
||||
|
||||
# Stop if that's not a layer
|
||||
if m is None:
|
||||
break
|
||||
|
||||
# The index of the layer.
|
||||
layer_idx = int(m.group(1))
|
||||
# The name of the operation.
|
||||
op_name = m.group(2)
|
||||
# Is it a weight or a bias?
|
||||
weight_or_bias = m.group(3)
|
||||
|
||||
# The name of the layer.
|
||||
layer_name = f"transformer.h.{layer_idx}"
|
||||
|
||||
# For layernorm(s), simply store the layer norm.
|
||||
if op_name.endswith("layernorm"):
|
||||
|
||||
ln_name = "ln_1" if op_name.startswith("input") else "ln_2"
|
||||
output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val
|
||||
|
||||
# Transpose the QKV matrix.
|
||||
elif op_name == "attention.query_key_value" and weight_or_bias == "weight":
|
||||
|
||||
# Insert a lower-triangular 1x1xDxD mask to fill GPT-2's causal-attention "bias" buffer.
causal_mask = torch.tril(torch.ones(1, 1, hidden_size, hidden_size))
output_state_dict[layer_name + ".attn.bias"] = causal_mask
|
||||
|
||||
# Insert a "dummy" tensor for masked_bias.
|
||||
masked_bias = torch.tensor(-1e4)
|
||||
output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias
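# GPT-2's attention module keeps "bias" (the causal mask) and "masked_bias" as registered
# buffers, so the converter fills in matching entries here.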
|
||||
|
||||
# Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D.
|
||||
out_val = val.transpose(0, 1)
|
||||
# Store.
|
||||
output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val
|
||||
|
||||
# The QKV bias is copied as is.
|
||||
elif op_name == "attention.query_key_value" and weight_or_bias == "bias":
|
||||
|
||||
# Store. No change of shape.
|
||||
output_state_dict[layer_name + ".attn.c_attn.bias"] = val
|
||||
|
||||
# Transpose the weights.
|
||||
elif weight_or_bias == "weight":
|
||||
|
||||
out_name = megatron_to_transformers[op_name]
|
||||
output_state_dict[layer_name + out_name + "weight"] = val.transpose(0, 1)
|
||||
|
||||
# Copy the bias.
|
||||
elif weight_or_bias == "bias":
|
||||
|
||||
out_name = megatron_to_transformers[op_name]
|
||||
output_state_dict[layer_name + out_name + "bias"] = val
|
||||
|
||||
# The final layernorm.
|
||||
output_state_dict["transformer.ln_f.weight"] = transformer["final_layernorm.weight"]
|
||||
output_state_dict["transformer.ln_f.bias"] = transformer["final_layernorm.bias"]
|
||||
|
||||
# For the LM head, transformers expects the weight matrix to be tied to the word embeddings.
|
||||
output_state_dict["lm_head.weight"] = word_embeddings
|
||||
|
||||
# The config.
|
||||
output_config = {
|
||||
"activation_function": "gelu_new",
|
||||
"architectures": ["GPT2LMHeadModel"],
|
||||
"attn_pdrop": 0.1,
|
||||
"bos_token_id": 50256,
|
||||
"embd_pdrop": 0.1,
|
||||
"eos_token_id": 50256,
|
||||
"initializer_range": 0.02,
|
||||
"layer_norm_epsilon": 1e-05,
|
||||
"model_type": "gpt2",
|
||||
"n_ctx": 1024,
|
||||
"n_embd": 1024,
|
||||
"n_head": 16,
|
||||
"n_layer": 24,
|
||||
"n_positions": 1024,
|
||||
"resid_pdrop": 0.1,
|
||||
"summary_activation": None,
|
||||
"summary_first_dropout": 0.1,
|
||||
"summary_proj_to_labels": True,
|
||||
"summary_type": "cls_index",
|
||||
"summary_use_proj": True,
|
||||
"vocab_size": 50257,
|
||||
}
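# These values match the released 345M Megatron-GPT2 checkpoint, which is GPT-2 medium sized
# (24 layers, n_embd 1024, 16 heads) and uses the standard 50257-token GPT-2 vocabulary.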
|
||||
|
||||
# It should be done!
|
||||
return output_state_dict, output_config
|
||||
|
||||
|
||||
####################################################################################################
|
||||
|
||||
|
||||
def main():
|
||||
# Create the argument parser.
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--print-checkpoint-structure", action="store_true")
|
||||
parser.add_argument("path_to_checkpoint", type=str, help="Path to the ZIP file containing the checkpoint")
|
||||
args = parser.parse_args()
|
||||
|
||||
# The directory that contains the checkpoint; the converted files are written next to it.
|
||||
basename = os.path.dirname(args.path_to_checkpoint)
|
||||
|
||||
# Load the model.
|
||||
print(f'Extracting PyTorch state dictionary from "{args.path_to_checkpoint}"')
|
||||
with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
|
||||
with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
|
||||
input_state_dict = torch.load(pytorch_dict, map_location="cpu")
|
||||
|
||||
# Convert.
|
||||
print("Converting")
|
||||
output_state_dict, output_config = convert_megatron_checkpoint(args, input_state_dict)
|
||||
|
||||
# Print the structure of converted state dict.
|
||||
if args.print_checkpoint_structure:
|
||||
recursive_print(None, output_state_dict)
|
||||
|
||||
# Store the config to file.
|
||||
output_config_file = os.path.join(basename, "config.json")
|
||||
print(f'Saving config to "{output_config_file}"')
|
||||
with open(output_config_file, "w") as f:
|
||||
json.dump(output_config, f)
|
||||
|
||||
# Store the state_dict to file.
|
||||
output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
|
||||
print(f'Saving checkpoint to "{output_checkpoint_file}"')
|
||||
torch.save(output_state_dict, output_checkpoint_file)
|
||||
|
||||
|
||||
####################################################################################################
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
####################################################################################################
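For reference, a directory produced by the GPT-2 converter above can be loaded like any other GPT-2 checkpoint. The snippet below is only an illustrative sketch, not part of this pull request: the checkpoint path is a placeholder, and the tokenizer is the standard "gpt2" BPE tokenizer fetched from the Hub, which matches the 50257-token vocabulary assumed by the config.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

checkpoint_dir = "/path/to/megatron_gpt2_345m"  # placeholder: directory holding config.json + pytorch_model.bin

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)
model.eval()

input_ids = tokenizer("The conversion script turns a Megatron checkpoint into", return_tensors="pt").input_ids
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=32, do_sample=False)
print(tokenizer.decode(output_ids[0]))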
|
@@ -1840,6 +1840,78 @@ class MBartModel:
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
class MegatronBertForCausalLM:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class MegatronBertForMaskedLM:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class MegatronBertForMultipleChoice:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class MegatronBertForNextSentencePrediction:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class MegatronBertForPreTraining:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class MegatronBertForQuestionAnswering:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class MegatronBertForSequenceClassification:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class MegatronBertForTokenClassification:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class MegatronBertModel:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
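# These lightweight placeholders follow the repository's dummy-object pattern: when torch is not
# installed, importing MegatronBertModel and friends still succeeds, but instantiating them (or
# calling from_pretrained) raises an informative error through requires_backends.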
|
||||
|
||||
|
||||
class MMBTForClassification:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
@@ -21,6 +21,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
|
||||
("BertConfig", "BertForQuestionAnswering"),
|
||||
("XLNetConfig", "XLNetForQuestionAnsweringSimple"),
|
||||
("FlaubertConfig", "FlaubertForQuestionAnsweringSimple"),
|
||||
("MegatronBertConfig", "MegatronBertForQuestionAnswering"),
|
||||
("MobileBertConfig", "MobileBertForQuestionAnswering"),
|
||||
("XLMConfig", "XLMForQuestionAnsweringSimple"),
|
||||
("ElectraConfig", "ElectraForQuestionAnswering"),
|
||||
|
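With this entry in place, MegatronBert checkpoints can drive the question-answering head like any other registered architecture. A minimal sketch, using a deliberately tiny illustrative config rather than a real checkpoint (none of these values come from the pull request):

import torch
from transformers import MegatronBertConfig, MegatronBertForQuestionAnswering

config = MegatronBertConfig(hidden_size=64, num_hidden_layers=2, num_attention_heads=4, intermediate_size=128)
model = MegatronBertForQuestionAnswering(config)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (1, 8))
outputs = model(input_ids)
print(outputs.start_logits.shape, outputs.end_logits.shape)  # (1, 8) each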
tests/test_modeling_megatron_bert.py (377 lines, new file)
@@ -0,0 +1,377 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
# Copyright 2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the PyTorch MegatronBERT model. """
|
||||
|
||||
|
||||
import math
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import (
|
||||
MODEL_FOR_PRETRAINING_MAPPING,
|
||||
MegatronBertConfig,
|
||||
MegatronBertForCausalLM,
|
||||
MegatronBertForMaskedLM,
|
||||
MegatronBertForMultipleChoice,
|
||||
MegatronBertForNextSentencePrediction,
|
||||
MegatronBertForPreTraining,
|
||||
MegatronBertForQuestionAnswering,
|
||||
MegatronBertForSequenceClassification,
|
||||
MegatronBertForTokenClassification,
|
||||
MegatronBertModel,
|
||||
)
|
||||
|
||||
|
||||
class MegatronBertModelTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=13,
|
||||
seq_length=7,
|
||||
is_training=True,
|
||||
use_input_mask=True,
|
||||
use_token_type_ids=True,
|
||||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=64,
|
||||
embedding_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=512,
|
||||
type_vocab_size=16,
|
||||
type_sequence_label_size=2,
|
||||
initializer_range=0.02,
|
||||
num_labels=3,
|
||||
num_choices=4,
|
||||
scope=None,
|
||||
):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.seq_length = seq_length
|
||||
self.is_training = is_training
|
||||
self.use_input_mask = use_input_mask
|
||||
self.use_token_type_ids = use_token_type_ids
|
||||
self.use_labels = use_labels
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.embedding_size = embedding_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_act = hidden_act
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.type_sequence_label_size = type_sequence_label_size
|
||||
self.initializer_range = initializer_range
|
||||
self.num_labels = num_labels
|
||||
self.num_choices = num_choices
|
||||
self.scope = scope
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
input_mask = None
|
||||
if self.use_input_mask:
|
||||
input_mask = random_attention_mask([self.batch_size, self.seq_length])
|
||||
|
||||
token_type_ids = None
|
||||
if self.use_token_type_ids:
|
||||
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||
|
||||
sequence_labels = None
|
||||
token_labels = None
|
||||
choice_labels = None
|
||||
if self.use_labels:
|
||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = MegatronBertConfig(
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
intermediate_size=self.intermediate_size,
|
||||
embedding_size=self.embedding_size,
|
||||
hidden_act=self.hidden_act,
|
||||
hidden_dropout_prob=self.hidden_dropout_prob,
|
||||
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
is_decoder=False,
|
||||
initializer_range=self.initializer_range,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
def create_and_check_megatron_bert_model(
|
||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
):
|
||||
model = MegatronBertModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||
result = model(input_ids, token_type_ids=token_type_ids)
|
||||
result = model(input_ids)
|
||||
|
||||
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
|
||||
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
|
||||
|
||||
def create_and_check_megatron_bert_for_masked_lm(
|
||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
):
|
||||
model = MegatronBertForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
|
||||
|
||||
def create_and_check_for_causal_lm(
|
||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
):
|
||||
model = MegatronBertForCausalLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
|
||||
|
||||
def create_and_check_megatron_bert_for_next_sequence_prediction(
|
||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
):
|
||||
model = MegatronBertForNextSentencePrediction(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
result = model(
|
||||
input_ids,
|
||||
attention_mask=input_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
labels=sequence_labels,
|
||||
)
|
||||
self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
|
||||
|
||||
def create_and_check_megatron_bert_for_pretraining(
|
||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
):
|
||||
model = MegatronBertForPreTraining(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
result = model(
|
||||
input_ids,
|
||||
attention_mask=input_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
labels=token_labels,
|
||||
next_sentence_label=sequence_labels,
|
||||
)
|
||||
self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
|
||||
self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
|
||||
|
||||
def create_and_check_megatron_bert_for_question_answering(
|
||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
):
|
||||
model = MegatronBertForQuestionAnswering(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
result = model(
|
||||
input_ids,
|
||||
attention_mask=input_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
start_positions=sequence_labels,
|
||||
end_positions=sequence_labels,
|
||||
)
|
||||
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
|
||||
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
|
||||
|
||||
def create_and_check_megatron_bert_for_sequence_classification(
|
||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
):
|
||||
config.num_labels = self.num_labels
|
||||
model = MegatronBertForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
|
||||
|
||||
def create_and_check_megatron_bert_for_token_classification(
|
||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
):
|
||||
config.num_labels = self.num_labels
|
||||
model = MegatronBertForTokenClassification(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
|
||||
|
||||
def create_and_check_megatron_bert_for_multiple_choice(
|
||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
):
|
||||
config.num_choices = self.num_choices
|
||||
model = MegatronBertForMultipleChoice(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||
result = model(
|
||||
multiple_choice_inputs_ids,
|
||||
attention_mask=multiple_choice_input_mask,
|
||||
token_type_ids=multiple_choice_token_type_ids,
|
||||
labels=choice_labels,
|
||||
)
|
||||
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
(
|
||||
config,
|
||||
input_ids,
|
||||
token_type_ids,
|
||||
input_mask,
|
||||
sequence_labels,
|
||||
token_labels,
|
||||
choice_labels,
|
||||
) = config_and_inputs
|
||||
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
|
||||
return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
|
||||
class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
|
||||
all_model_classes = (
|
||||
(
|
||||
MegatronBertModel,
|
||||
MegatronBertForMaskedLM,
|
||||
MegatronBertForCausalLM,
|
||||
MegatronBertForMultipleChoice,
|
||||
MegatronBertForNextSentencePrediction,
|
||||
MegatronBertForPreTraining,
|
||||
MegatronBertForQuestionAnswering,
|
||||
MegatronBertForSequenceClassification,
|
||||
MegatronBertForTokenClassification,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
|
||||
# test_resize_embeddings = False
|
||||
test_head_masking = False
|
||||
|
||||
# special case for ForPreTraining model
|
||||
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
)
|
||||
inputs_dict["next_sentence_label"] = torch.zeros(
|
||||
self.model_tester.batch_size, dtype=torch.long, device=torch_device
|
||||
)
|
||||
return inputs_dict
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = MegatronBertModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=MegatronBertConfig, hidden_size=37)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_megatron_bert_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_megatron_bert_model(*config_and_inputs)
|
||||
|
||||
def test_for_masked_lm(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_megatron_bert_for_masked_lm(*config_and_inputs)
|
||||
|
||||
def test_for_multiple_choice(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_megatron_bert_for_multiple_choice(*config_and_inputs)
|
||||
|
||||
def test_for_next_sequence_prediction(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_megatron_bert_for_next_sequence_prediction(*config_and_inputs)
|
||||
|
||||
def test_for_pretraining(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_megatron_bert_for_pretraining(*config_and_inputs)
|
||||
|
||||
def test_for_question_answering(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_megatron_bert_for_question_answering(*config_and_inputs)
|
||||
|
||||
def test_for_sequence_classification(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_megatron_bert_for_sequence_classification(*config_and_inputs)
|
||||
|
||||
def test_for_token_classification(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_megatron_bert_for_token_classification(*config_and_inputs)
|
||||
|
||||
|
||||
def _long_tensor(tok_lst):
|
||||
return torch.tensor(
|
||||
tok_lst,
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
|
||||
|
||||
TOLERANCE = 1e-4
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_sentencepiece
|
||||
@require_tokenizers
|
||||
class MegatronBertModelIntegrationTests(unittest.TestCase):
|
||||
@slow
|
||||
def test_inference_no_head(self):
|
||||
directory = "nvidia/megatron-bert-uncased-345m"
|
||||
if "MYDIR" in os.environ:
|
||||
directory = os.path.join(os.environ["MYDIR"], directory)
|
||||
model = MegatronBertModel.from_pretrained(directory)
|
||||
model.to(torch_device)
|
||||
model.half()
|
||||
input_ids = _long_tensor([[101, 7110, 1005, 1056, 2023, 11333, 17413, 1029, 102]])
|
||||
with torch.no_grad():
|
||||
output = model(input_ids)[0]
|
||||
expected_shape = torch.Size((1, 9, 1024))
|
||||
self.assertEqual(output.shape, expected_shape)
|
||||
|
||||
expected = [-0.6040, -0.2517, -0.1025, 0.3420, -0.6758, -0.0017, -0.1089, -0.1990, 0.5728]
|
||||
for ii in range(3):
|
||||
for jj in range(3):
|
||||
a = output[0, ii, jj]
|
||||
b = expected[3 * ii + jj]
|
||||
msg = "ii={} jj={} a={} b={}".format(ii, jj, a, b)
|
||||
self.assertTrue(math.isclose(a, b, rel_tol=TOLERANCE, abs_tol=TOLERANCE), msg=msg)
|
@@ -45,6 +45,10 @@ IGNORE_NON_TESTED = [
|
||||
"BlenderbotDecoderWrapper", # Building part of bigger (tested) model.
|
||||
"MBartEncoder", # Building part of bigger (tested) model.
|
||||
"MBartDecoderWrapper", # Building part of bigger (tested) model.
|
||||
"MegatronBertLMHeadModel", # Building part of bigger (tested) model.
|
||||
"MegatronBertEncoder", # Building part of bigger (tested) model.
|
||||
"MegatronBertDecoder", # Building part of bigger (tested) model.
|
||||
"MegatronBertDecoderWrapper", # Building part of bigger (tested) model.
|
||||
"PegasusEncoder", # Building part of bigger (tested) model.
|
||||
"PegasusDecoderWrapper", # Building part of bigger (tested) model.
|
||||
"DPREncoder", # Building part of bigger (tested) model.
|
||||
|