Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)

Commit fbad9bb56e: Merge branch 'master' into tapas-tf
@@ -279,7 +279,7 @@ jobs:
             - v0.4-tf-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
+      - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
       - run: pip install tensorflow_probability
       - save_cache:
           key: v0.4-tf-{{ checksum "setup.py" }}
@@ -313,7 +313,7 @@ jobs:
             - v0.4-tf-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
+      - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
       - run: pip install tensorflow_probability
       - save_cache:
           key: v0.4-tf-{{ checksum "setup.py" }}
.github/workflows/self-nightly-scheduled.yml (vendored, 7 changes)
@@ -205,8 +205,9 @@ jobs:
            apt -y update && apt install -y libaio-dev
            pip install --upgrade pip
            pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
-           pip install .[testing,deepspeed,fairscale]
-           pip install git+https://github.com/microsoft/DeepSpeed
+           rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
+           pip install .[testing,fairscale]
+           pip install git+https://github.com/microsoft/DeepSpeed # testing bleeding edge

        - name: Are GPUs recognized by our DL frameworks
          run: |
@@ -218,7 +219,7 @@ jobs:
        - name: Run all tests on GPU
          run: |
            python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

        - name: Failure short reports
          if: ${{ always() }}
          run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
.github/workflows/self-push.yml (vendored, 13 changes)
@@ -50,7 +50,7 @@ jobs:
            python -c "import torch; print('Cuda version:', torch.version.cuda)"
            python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
            python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

        - name: Fetch the tests to run
          run: |
            python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
@@ -105,7 +105,7 @@ jobs:
          run: |
            python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
            python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"

        - name: Fetch the tests to run
          run: |
            python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
@@ -203,7 +203,7 @@ jobs:
            apt install -y libsndfile1-dev
            pip install --upgrade pip
            pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

        - name: Launcher docker
          uses: actions/checkout@v2
          with:
@@ -277,7 +277,7 @@ jobs:
    #      run: |
    #        python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
    #        python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
    #
    #
    #    - name: Fetch the tests to run
    #      run: |
    #        python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
@@ -389,11 +389,11 @@ jobs:
            python -c "import torch; print('Cuda version:', torch.version.cuda)"
            python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
            python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

        - name: Fetch the tests to run
          run: |
            python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt

        - name: Report fetched tests
          uses: actions/upload-artifact@v2
          with:
@@ -437,6 +437,7 @@ jobs:
          run: |
            apt -y update && apt install -y libaio-dev
            pip install --upgrade pip
+           rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
            pip install .[testing,deepspeed,fairscale]

        - name: Are GPUs recognized by our DL frameworks
.github/workflows/self-scheduled.yml (vendored, 5 changes)
@@ -143,7 +143,7 @@ jobs:
          run: |
            apt -y update && apt install -y libsndfile1-dev git
            pip install --upgrade pip
-           pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]
+           pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]

        - name: Are GPUs recognized by our DL frameworks
          run: |
@@ -293,7 +293,7 @@ jobs:
          run: |
            apt -y update && apt install -y libsndfile1-dev git
            pip install --upgrade pip
-           pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]
+           pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]

        - name: Are GPUs recognized by our DL frameworks
          run: |
@@ -429,6 +429,7 @@ jobs:
          run: |
            apt -y update && apt install -y libaio-dev
            pip install --upgrade pip
+           rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
            pip install .[testing,deepspeed,fairscale]

        - name: Are GPUs recognized by our DL frameworks
@@ -46,6 +46,20 @@ won't be possible on a single GPU.
 parts of DeepSpeed like ``zero.Init`` for ZeRO stage 3 and higher. To tap into this feature read the docs on
 :ref:`deepspeed-non-trainer-integration`.

+What is integrated:
+
+Training:
+
+1. DeepSpeed ZeRO training supports the full ZeRO stages 1, 2 and 3 with ZeRO-Infinity (CPU and NVME offload).
+
+Inference:
+
+1. DeepSpeed ZeRO Inference supports ZeRO stage 3 with ZeRO-Infinity. It uses the same ZeRO protocol as training, but
+   it doesn't use an optimizer and a lr scheduler and only stage 3 is relevant. For more details see:
+   :ref:`deepspeed-zero-inference`.
+
+There is also DeepSpeed Inference - this is a totally different technology which uses Tensor Parallelism instead of
+ZeRO (coming soon).

@@ -1628,6 +1642,47 @@ larger multi-dimensional shape, this means that the parameter is partitioned and


+.. _deepspeed-zero-inference:
+
+
+ZeRO Inference
+=======================================================================================================================
+
+ZeRO Inference uses the same config as ZeRO-3 Training. You just don't need the optimizer and scheduler sections. In
+fact you can leave these in the config file if you want to share the same one with the training. They will just be
+ignored.
+
+Otherwise you just need to pass the usual :class:`~transformers.TrainingArguments` arguments. For example:
+
+.. code-block:: bash
+
+    deepspeed --num_gpus=2 your_program.py <normal cl args> --do_eval --deepspeed ds_config.json
+
+The only important thing is that you need to use a ZeRO-3 configuration, since ZeRO-2 provides no benefit whatsoever
+for the inference as only ZeRO-3 performs sharding of parameters, whereas ZeRO-1 shards gradients and optimizer states.
+
+Here is an example of running ``run_translation.py`` under DeepSpeed deploying all available GPUs:
+
+.. code-block:: bash
+
+    deepspeed examples/pytorch/translation/run_translation.py \
+    --deepspeed tests/deepspeed/ds_config_zero3.json \
+    --model_name_or_path t5-small --output_dir output_dir \
+    --do_eval --max_eval_samples 50 --warmup_steps 50 \
+    --max_source_length 128 --val_max_target_length 128 \
+    --overwrite_output_dir --per_device_eval_batch_size 4 \
+    --predict_with_generate --dataset_config "ro-en" --fp16 \
+    --source_lang en --target_lang ro --dataset_name wmt16 \
+    --source_prefix "translate English to Romanian: "
+
+Since for inference there is no need for additional large memory used by the optimizer states and the gradients you
+should be able to fit much larger batches and/or sequence length onto the same hardware.
+
+Additionally DeepSpeed is currently developing a related product called Deepspeed-Inference which has no relationship
+to the ZeRO technology, but instead uses tensor parallelism to scale models that can't fit onto a single GPU. This is a
+work in progress and we will provide the integration once that product is complete.

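As a rough illustration of the ZeRO Inference setup documented above, the ZeRO-3 config can also be handed to :class:`~transformers.TrainingArguments` as an already-loaded dict. The sketch below is a pared-down, illustrative config relying on the "auto" placeholders the Trainer integration fills in; it is not the file used by the example (the tested reference is ``tests/deepspeed/ds_config_zero3.json``):

    # Sketch only: a minimal ZeRO-3 style config as a Python dict. The optimizer and
    # lr_scheduler sections are simply omitted, which is all ZeRO Inference needs.
    from transformers import TrainingArguments

    ds_config = {
        "fp16": {"enabled": "auto"},
        "zero_optimization": {
            "stage": 3,
            "offload_param": {"device": "cpu", "pin_memory": True},
        },
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
    }

    args = TrainingArguments(
        output_dir="output_dir",
        do_eval=True,
        per_device_eval_batch_size=4,
        fp16=True,
        deepspeed=ds_config,  # a dict is accepted in place of a path to a json file
    )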
 Filing Issues
 =======================================================================================================================
@@ -39,7 +39,8 @@ methods for using all the tokenizers:
 - Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the
   tokenizer for easy access and making sure they are not split during tokenization.

-:class:`~transformers.BatchEncoding` holds the output of the tokenizer's encoding methods (``__call__``,
+:class:`~transformers.BatchEncoding` holds the output of the
+:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`'s encoding methods (``__call__``,
 ``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python
 tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by
 these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by
@@ -96,10 +96,10 @@ ImageGPTModel
     :members: forward


-ImageGPTForCausalLM
+ImageGPTForCausalImageModeling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.ImageGPTForCausalLM
+.. autoclass:: transformers.ImageGPTForCausalImageModeling
     :members: forward

@@ -51,6 +51,15 @@ The easiest way to use a pretrained model on a given task is to use :func:`~transformers.pipeline`
 Let's see how this work for sentiment analysis (the other tasks are all covered in the :doc:`task summary
 </task_summary>`):

+Install the following dependencies (if not already installed):
+
+.. code-block:: bash
+
+    ## PYTORCH CODE
+    pip install torch
+    ## TENSORFLOW CODE
+    pip install tensorflow
+
 .. code-block::

     >>> from transformers import pipeline
@@ -337,27 +346,42 @@ Once your model is fine-tuned, you can save it with its tokenizer in the following way:

 .. code-block::

-    tokenizer.save_pretrained(save_directory)
-    model.save_pretrained(save_directory)
+    >>> ## PYTORCH CODE
+    >>> pt_save_directory = './pt_save_pretrained'
+    >>> tokenizer.save_pretrained(pt_save_directory)
+    >>> pt_model.save_pretrained(pt_save_directory)
+    >>> ## TENSORFLOW CODE
+    >>> tf_save_directory = './tf_save_pretrained'
+    >>> tokenizer.save_pretrained(tf_save_directory)
+    >>> tf_model.save_pretrained(tf_save_directory)

 You can then load this model back using the :func:`~transformers.AutoModel.from_pretrained` method by passing the
 directory name instead of the model name. One cool feature of 🤗 Transformers is that you can easily switch between
-PyTorch and TensorFlow: any model saved as before can be loaded back either in PyTorch or TensorFlow. If you are
-loading a saved PyTorch model in a TensorFlow model, use :func:`~transformers.TFAutoModel.from_pretrained` like this:
+PyTorch and TensorFlow: any model saved as before can be loaded back either in PyTorch or TensorFlow.
+
+If you would like to load your saved model in the other framework, first make sure it is installed:
+
+.. code-block:: bash
+
+    ## PYTORCH CODE
+    pip install tensorflow
+    ## TENSORFLOW CODE
+    pip install torch
+
+Then, use the corresponding Auto class to load it like this:

 .. code-block::

-    from transformers import TFAutoModel
-    tokenizer = AutoTokenizer.from_pretrained(save_directory)
-    model = TFAutoModel.from_pretrained(save_directory, from_pt=True)
+    ## PYTORCH CODE
+    >>> from transformers import TFAutoModel
+    >>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
+    >>> tf_model = TFAutoModel.from_pretrained(pt_save_directory, from_pt=True)
+    ## TENSORFLOW CODE
+    >>> from transformers import AutoModel
+    >>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
+    >>> pt_model = AutoModel.from_pretrained(tf_save_directory, from_tf=True)

-and if you are loading a saved TensorFlow model in a PyTorch model, you should use the following code:
-
-.. code-block::
-
-    from transformers import AutoModel
-    tokenizer = AutoTokenizer.from_pretrained(save_directory)
-    model = AutoModel.from_pretrained(save_directory, from_tf=True)

 Lastly, you can also ask the model to return all hidden states and all attention weights if you need them:
@@ -27,6 +27,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Callable, Optional

@@ -430,7 +431,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
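The same ``sum(..., [])`` to ``itertools.chain`` switch recurs in the remaining example scripts below. A standalone sketch (the batch values are made up) of why the two are interchangeable here, with ``chain`` avoiding the repeated list copies that make ``sum`` quadratic on large batches:

    from itertools import chain

    # Illustrative stand-in for one column of a tokenized batch, e.g. examples["input_ids"].
    batch_column = [[101, 7592, 102], [101, 2088, 102], [101, 999, 102]]

    # sum() re-copies the accumulated list on every "+", i.e. quadratic time overall.
    flattened_slow = sum(batch_column, [])

    # chain() walks each sub-list exactly once, i.e. linear time; this is what the scripts now use.
    flattened_fast = list(chain(*batch_column))

    assert flattened_slow == flattened_fast  # identical result, different cost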
@@ -25,6 +25,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain

 # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
 from pathlib import Path
@@ -453,7 +454,7 @@ if __name__ == "__main__":
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -25,6 +25,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Dict, List, Optional

@@ -563,7 +564,7 @@ if __name__ == "__main__":
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -26,6 +26,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional

 import datasets
@@ -408,7 +409,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -27,6 +27,7 @@ import logging
 import math
 import os
 import random
+from itertools import chain
 from pathlib import Path

 import datasets
@@ -366,7 +367,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -26,6 +26,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional

 import datasets
@@ -432,7 +433,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -27,6 +27,7 @@ import logging
 import math
 import os
 import random
+from itertools import chain
 from pathlib import Path

 import datasets
@@ -406,7 +407,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -23,6 +23,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional

 import datasets
@@ -403,7 +404,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -22,6 +22,7 @@ import logging
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional, Union

 import datasets
@@ -185,7 +186,7 @@ class DataCollatorForMultipleChoice:
         flattened_features = [
             [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
         ]
-        flattened_features = sum(flattened_features, [])
+        flattened_features = list(chain(*flattened_features))

         batch = self.tokenizer.pad(
             flattened_features,
@@ -333,8 +334,8 @@ def main():
         ]

         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))

         # Tokenize
         tokenized_examples = tokenizer(
@@ -24,6 +24,7 @@ import math
 import os
 import random
 from dataclasses import dataclass
+from itertools import chain
 from pathlib import Path
 from typing import Optional, Union

@@ -224,7 +225,7 @@ class DataCollatorForMultipleChoice:
         flattened_features = [
             [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
         ]
-        flattened_features = sum(flattened_features, [])
+        flattened_features = list(chain(*flattened_features))

         batch = self.tokenizer.pad(
             flattened_features,
@@ -365,8 +366,8 @@ def main():
         labels = examples[label_column_name]

         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))

         # Tokenize
         tokenized_examples = tokenizer(
@@ -23,6 +23,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Callable, Optional

@@ -364,7 +365,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -30,6 +30,7 @@ import random
 import sys
 from dataclasses import dataclass, field
 from functools import partial
+from itertools import chain
 from pathlib import Path
 from typing import Optional

@@ -406,7 +407,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -32,6 +32,7 @@ import random
 import sys
 from dataclasses import dataclass, field
 from functools import partial
+from itertools import chain
 from pathlib import Path
 from typing import Optional

@@ -462,7 +463,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -22,6 +22,7 @@ import logging
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Optional

@@ -342,8 +343,8 @@ def main():
         ]

         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))

         # Tokenize
         tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)
setup.py (2 changes)
@@ -97,7 +97,7 @@ _deps = [
     "cookiecutter==1.7.2",
     "dataclasses",
     "datasets",
-    "deepspeed>=0.5.3",
+    "deepspeed>=0.5.7",
     "docutils==0.16.0",
     "fairscale>0.3",
     "faiss-cpu",
@@ -94,7 +94,8 @@ _import_structure = {
         "DataCollatorWithPadding",
         "default_data_collator",
     ],
-    "feature_extraction_sequence_utils": ["BatchFeature", "SequenceFeatureExtractor"],
+    "feature_extraction_sequence_utils": ["SequenceFeatureExtractor"],
+    "feature_extraction_utils": ["BatchFeature"],
     "file_utils": [
         "CONFIG_NAME",
         "MODEL_CARD_NAME",
@@ -618,6 +619,7 @@ if is_torch_available():
     _import_structure["models.auto"].extend(
         [
             "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
+            "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING",
             "MODEL_FOR_CAUSAL_LM_MAPPING",
             "MODEL_FOR_CTC_MAPPING",
             "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
@@ -976,7 +978,7 @@ if is_torch_available():
     _import_structure["models.imagegpt"].extend(
         [
             "IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "ImageGPTForCausalLM",
+            "ImageGPTForCausalImageModeling",
             "ImageGPTForImageClassification",
             "ImageGPTModel",
             "ImageGPTPreTrainedModel",
@@ -2071,9 +2073,10 @@ if TYPE_CHECKING:
         DataCollatorWithPadding,
         default_data_collator,
     )
+    from .feature_extraction_sequence_utils import SequenceFeatureExtractor

     # Feature Extractor
-    from .feature_extraction_utils import BatchFeature, SequenceFeatureExtractor
+    from .feature_extraction_utils import BatchFeature

     # Files and general utilities
     from .file_utils import (
@@ -2531,6 +2534,7 @@ if TYPE_CHECKING:
     )
     from .models.auto import (
         MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
+        MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
         MODEL_FOR_CAUSAL_LM_MAPPING,
         MODEL_FOR_CTC_MAPPING,
         MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
@@ -2833,7 +2837,7 @@ if TYPE_CHECKING:
     )
     from .models.imagegpt import (
         IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST,
-        ImageGPTForCausalLM,
+        ImageGPTForCausalImageModeling,
         ImageGPTForImageClassification,
         ImageGPTModel,
         ImageGPTPreTrainedModel,
@@ -111,6 +111,29 @@ class HfDeepSpeedConfig:
             return default
         return config.get(ds_key, default)

+    def del_config_sub_tree(self, ds_key_long, must_exist=False):
+        """
+        Deletes a sub-section of the config file if it's found.
+
+        Unless ``must_exist`` is :obj:`True` the section doesn't have to exist.
+        """
+        config = self.config
+
+        # find the config node of interest if it exists
+        nodes = ds_key_long.split(".")
+        for node in nodes:
+            parent_config = config
+            config = config.get(node)
+            if config is None:
+                if must_exist:
+                    raise ValueError(f"Can't find {ds_key_long} entry in the config: {self.config}")
+                else:
+                    return
+
+        # if found remove it
+        if parent_config is not None:
+            parent_config.pop(node)
+
     def is_true(self, ds_key_long):
         """
         Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. So use this method to
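A standalone sketch of what the ``del_config_sub_tree`` helper added above does to a nested config. The dict below is illustrative rather than a complete DeepSpeed config, and the function is re-implemented locally so the behaviour can be checked without DeepSpeed installed:

    def del_config_sub_tree(config, ds_key_long, must_exist=False):
        # Walk the dotted path, remembering the parent so the final node can be popped.
        node = config
        parent, last = None, None
        for key in ds_key_long.split("."):
            parent, last = node, key
            node = node.get(key)
            if node is None:
                if must_exist:
                    raise ValueError(f"Can't find {ds_key_long} entry in the config: {config}")
                return
        parent.pop(last)

    ds_config = {
        "zero_optimization": {"stage": 3},
        "optimizer": {"type": "AdamW", "params": {"lr": 3e-5}},
        "lr_scheduler": {"type": "WarmupLR"},
    }

    # Mirrors what the inference path does: drop the training-only sections.
    del_config_sub_tree(ds_config, "optimizer")
    del_config_sub_tree(ds_config, "lr_scheduler")
    del_config_sub_tree(ds_config, "scheduler.params")  # missing keys are ignored unless must_exist=True

    print(sorted(ds_config))  # ['zero_optimization']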
@@ -280,30 +303,10 @@ def deepspeed_config():
     return None


-def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None):
+def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps):
     """
-    Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.
-
-    If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made.
-
-    Args:
-        trainer: Trainer object
-        num_training_steps: per single gpu
-        resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
-
-    Returns: model, optimizer, lr_scheduler
-
+    A convenience wrapper that deals with optimizer and lr scheduler configuration.
     """
-    import deepspeed
-    from deepspeed.utils import logger as ds_logger
-
-    model = trainer.model
-    args = trainer.args
-
-    hf_deepspeed_config = args.hf_deepspeed_config
-    hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps)
-
-    # resume config update - some bits like `model` and `num_training_steps` only become available during train
     config = hf_deepspeed_config.config

     # Optimizer + Scheduler
@@ -351,13 +354,54 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None):
    else:
        lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)

    # keep for quick debug:
    # from pprint import pprint; pprint(config)
    return optimizer, lr_scheduler

    # set the Deepspeed log level consistent with the trainer

def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inference=False):
    """
    Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.

    If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made.

    Args:
        trainer: Trainer object
        num_training_steps: per single gpu
        resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
        inference: launch in inference mode (no optimizer and no lr scheduler)

    Returns: model, optimizer, lr_scheduler

    """
    import deepspeed
    from deepspeed.utils import logger as ds_logger

    model = trainer.model
    args = trainer.args

    # resume config update - some bits like `model` and `num_training_steps` only become available during train
    hf_deepspeed_config = args.hf_deepspeed_config
    hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps)
    config = hf_deepspeed_config.config

    # set the Deepspeed log level consistent with the Trainer
    ds_logger.setLevel(args.get_process_log_level())

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    if inference:
        # only Z3 makes sense for the inference
        if not hf_deepspeed_config.is_zero3():
            raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config")

        # in case the training config is re-used for inference
        hf_deepspeed_config.del_config_sub_tree("optimizer")
        hf_deepspeed_config.del_config_sub_tree("lr_scheduler")
        optimizer, lr_scheduler = None, None
        model_parameters = None
    else:
        optimizer, lr_scheduler = deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps)
        model_parameters = filter(lambda p: p.requires_grad, model.parameters())

    # keep for quick debug:
    # from pprint import pprint; pprint(config)

    model, optimizer, _, lr_scheduler = deepspeed.initialize(
        model=model,
@@ -8,7 +8,7 @@ deps = {
     "cookiecutter": "cookiecutter==1.7.2",
     "dataclasses": "dataclasses",
     "datasets": "datasets",
-    "deepspeed": "deepspeed>=0.5.3",
+    "deepspeed": "deepspeed>=0.5.7",
     "docutils": "docutils==0.16.0",
     "fairscale": "fairscale>0.3",
     "faiss-cpu": "faiss-cpu",
@@ -35,6 +35,7 @@ from dataclasses import fields
 from enum import Enum
 from functools import partial, wraps
 from hashlib import sha256
+from itertools import chain
 from pathlib import Path
 from types import ModuleType
 from typing import Any, BinaryIO, ContextManager, Dict, List, Optional, Tuple, Union
@@ -2148,7 +2149,7 @@ class _LazyModule(ModuleType):
             for value in values:
                 self._class_to_module[value] = key
         # Needed for autocompletion in an IDE
-        self.__all__ = list(import_structure.keys()) + sum(import_structure.values(), [])
+        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
         self.__file__ = module_file
         self.__spec__ = module_spec
         self.__path__ = [os.path.dirname(module_file)]
@@ -32,6 +32,7 @@ _import_structure = {
 if is_torch_available():
     _import_structure["modeling_auto"] = [
         "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
+        "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING",
         "MODEL_FOR_CAUSAL_LM_MAPPING",
         "MODEL_FOR_CTC_MAPPING",
         "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
@@ -139,6 +140,7 @@ if TYPE_CHECKING:
     if is_torch_available():
         from .modeling_auto import (
             MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
+            MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
             MODEL_FOR_CAUSAL_LM_MAPPING,
             MODEL_FOR_CTC_MAPPING,
             MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
@@ -147,7 +147,6 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
 MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
     [
         # Model with LM heads mapping
-        ("imagegpt", "ImageGPTForCausalLM"),
         ("qdqbert", "QDQBertForMaskedLM"),
         ("fnet", "FNetForMaskedLM"),
         ("gptj", "GPTJForCausalLM"),
@@ -199,7 +198,6 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
 MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
     [
         # Model for Causal LM mapping
-        ("imagegpt", "ImageGPTForCausalLM"),
         ("qdqbert", "QDQBertLMHeadModel"),
         ("trocr", "TrOCRForCausalLM"),
         ("gptj", "GPTJForCausalLM"),
@@ -233,6 +231,13 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
     ]
 )

+MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES = OrderedDict(
+    # Model for Causal Image Modeling mapping
+    [
+        ("imagegpt", "ImageGPTForCausalImageModeling"),
+    ]
+)
+
 MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
     [
         # Model for Image Classification mapping
@@ -524,6 +529,9 @@ MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
 MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES)
 MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES)
 MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
+MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES
+)
 MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
 )
@@ -444,7 +444,7 @@ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
         init_variables = self.module.init(
             jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
         )
-        return init_variables["cache"]
+        return unfreeze(init_variables["cache"])

     @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
     def __call__(
@@ -388,7 +388,7 @@ class FlaxGPTNeoPreTrainedModel(FlaxPreTrainedModel):
         init_variables = self.module.init(
             jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
         )
-        return init_variables["cache"]
+        return unfreeze(init_variables["cache"])

     @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
     def __call__(
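Both Flax changes above wrap the freshly initialised cache in ``unfreeze``. A small sketch of what that conversion does (the nested dict is a stand-in for the real cache pytree returned by ``module.init``):

    from flax.core.frozen_dict import FrozenDict, unfreeze

    # module.init(..., init_cache=True) hands back variables as an immutable FrozenDict.
    frozen_cache = FrozenDict({"layer_0": {"cached_key": 0, "cached_value": 0}})

    # unfreeze() converts it into plain nested dicts that downstream code may mutate.
    mutable_cache = unfreeze(frozen_cache)
    mutable_cache["layer_0"]["cache_index"] = 1

    print(type(frozen_cache).__name__, type(mutable_cache).__name__)  # FrozenDict dict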
@@ -31,7 +31,7 @@ if is_vision_available():
 if is_torch_available():
     _import_structure["modeling_imagegpt"] = [
         "IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST",
-        "ImageGPTForCausalLM",
+        "ImageGPTForCausalImageModeling",
         "ImageGPTForImageClassification",
         "ImageGPTModel",
         "ImageGPTPreTrainedModel",
@@ -48,7 +48,7 @@ if TYPE_CHECKING:
     if is_torch_available():
         from .modeling_imagegpt import (
             IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST,
-            ImageGPTForCausalLM,
+            ImageGPTForCausalImageModeling,
             ImageGPTForImageClassification,
             ImageGPTModel,
             ImageGPTPreTrainedModel,
@@ -881,7 +881,7 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
     """,
     IMAGEGPT_START_DOCSTRING,
 )
-class ImageGPTForCausalLM(ImageGPTPreTrainedModel):
+class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]

     def __init__(self, config):
@@ -958,13 +958,13 @@ class ImageGPTForCausalLM(ImageGPTPreTrainedModel):

         Examples::

-            >>> from transformers import ImageGPTFeatureExtractor, ImageGPTForCausalLM
+            >>> from transformers import ImageGPTFeatureExtractor, ImageGPTForCausalImageModeling
             >>> import torch
             >>> import matplotlib.pyplot as plt
             >>> import numpy as np

             >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small')
-            >>> model = ImageGPTForCausalLM.from_pretrained('openai/imagegpt-small')
+            >>> model = ImageGPTForCausalImageModeling.from_pretrained('openai/imagegpt-small')
             >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
             >>> model.to(device)

@@ -47,11 +47,11 @@ def normalize_box(box, width, height):
     ]


-def apply_tesseract(image: Image.Image):
+def apply_tesseract(image: Image.Image, lang: Optional[str]):
     """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""

     # apply OCR
-    data = pytesseract.image_to_data(image, output_type="dict")
+    data = pytesseract.image_to_data(image, lang=lang, output_type="dict")
     words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]

     # filter empty words and corresponding coordinates
@@ -102,6 +102,9 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
             Only has an effect if :obj:`do_resize` is set to :obj:`True`.
         apply_ocr (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
+        ocr_lang (:obj:`Optional[str]`, `optional`):
+            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+            used.

         .. note::

@@ -110,12 +113,13 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):

     model_input_names = ["pixel_values"]

-    def __init__(self, do_resize=True, size=224, resample=Image.BILINEAR, apply_ocr=True, **kwargs):
+    def __init__(self, do_resize=True, size=224, resample=Image.BILINEAR, apply_ocr=True, ocr_lang=None, **kwargs):
         super().__init__(**kwargs)
         self.do_resize = do_resize
         self.size = size
         self.resample = resample
         self.apply_ocr = apply_ocr
+        self.ocr_lang = ocr_lang
         if apply_ocr:
             requires_backends(self, "pytesseract")

@@ -199,7 +203,7 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
             words_batch = []
             boxes_batch = []
             for image in images:
-                words, boxes = apply_tesseract(self.to_pil_image(image))
+                words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang)
                 words_batch.append(words)
                 boxes_batch.append(boxes)

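A short usage sketch of the new ``ocr_lang`` argument; the language code and image path are illustrative, and the matching Tesseract language pack has to be installed for pytesseract to find it:

    from PIL import Image
    from transformers import LayoutLMv2FeatureExtractor

    # "deu" (German) is an illustrative Tesseract language code.
    feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=True, ocr_lang="deu")

    image = Image.open("scanned_invoice.png").convert("RGB")  # placeholder path
    encoding = feature_extractor(image, return_tensors="pt")
    print(encoding.keys())  # pixel_values, plus words and boxes from the OCR pass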
@@ -1275,7 +1275,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
             if "bbox" in encoded_inputs:
                 encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
             if "labels" in encoded_inputs:
-                encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["bbox"]
+                encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
             if "special_tokens_mask" in encoded_inputs:
                 encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
             encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
@@ -746,7 +746,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
             if "bbox" in encoded_inputs:
                 encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
             if "labels" in encoded_inputs:
-                encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["bbox"]
+                encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
             if "special_tokens_mask" in encoded_inputs:
                 encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
             encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
@@ -1051,7 +1051,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
             if "bbox" in encoded_inputs:
                 encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
             if "labels" in encoded_inputs:
-                encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["bbox"]
+                encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
             if "special_tokens_mask" in encoded_inputs:
                 encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
             encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
@@ -614,7 +614,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
             if "bbox" in encoded_inputs:
                 encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
             if "labels" in encoded_inputs:
-                encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["bbox"]
+                encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
             if "special_tokens_mask" in encoded_inputs:
                 encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
             encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
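The same one-line fix appears in all four tokenizers above: when left-padding, ``labels`` was being extended with ``bbox`` entries instead of with ``pad_token_label``. A standalone sketch of the corrected behaviour, with illustrative values matching the defaults these tokenizers use:

    pad_token_label = -100          # default label padding value, ignored by the loss
    pad_token_box = [0, 0, 0, 0]    # default bounding-box padding value

    labels = [3, 7, 7]              # word labels for a 3-token sequence
    bbox = [[1, 2, 3, 4]] * 3       # one bounding box per token
    difference = 2                  # number of positions to prepend (left padding)

    # Before the fix, labels were padded with bbox entries (a list of lists), silently
    # corrupting the label column; after the fix they are padded with pad_token_label.
    padded_labels = [pad_token_label] * difference + labels
    padded_bbox = [pad_token_box] * difference + bbox

    print(padded_labels)  # [-100, -100, 3, 7, 7]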
@@ -1747,6 +1747,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             init_configuration,
             *init_inputs,
             use_auth_token=use_auth_token,
+            cache_dir=cache_dir,
             **kwargs,
         )

@@ -1758,6 +1759,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         init_configuration,
         *init_inputs,
         use_auth_token=None,
+        cache_dir=None,
         **kwargs
     ):
         # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
@@ -1797,7 +1799,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):

             # Second attempt. If we have not yet found tokenizer_class, let's try to use the config.
             try:
-                config = AutoConfig.from_pretrained(pretrained_model_name_or_path, use_auth_token=use_auth_token)
+                config = AutoConfig.from_pretrained(
+                    pretrained_model_name_or_path,
+                    use_auth_token=use_auth_token,
+                    cache_dir=cache_dir,
+                )
                 config_tokenizer_class = config.tokenizer_class
             except (OSError, ValueError, KeyError):
                 # skip if an error occurred.
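The practical effect of threading ``cache_dir`` through to the fallback ``AutoConfig`` lookup is simply that a user-supplied cache location is respected end to end; a minimal sketch (the cache path is illustrative):

    from transformers import AutoTokenizer

    # Previously the internal AutoConfig.from_pretrained() fallback ignored cache_dir and
    # wrote to the default cache; with this change the same directory is used throughout.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", cache_dir="/tmp/hf-cache")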
@@ -2229,15 +2229,12 @@ class Trainer:

             # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval
             # from the checkpoint eventually
-            deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None)
+            deepspeed_engine, _, _ = deepspeed_init(
+                self, num_training_steps=0, resume_from_checkpoint=None, inference=True
+            )
             self.model = deepspeed_engine.module
             self.model_wrapped = deepspeed_engine
             self.deepspeed = deepspeed_engine
-            # XXX: we don't need optim/sched for inference, but this needs to be sorted out, since
-            # for example the Z3-optimizer is a must for zero3 to work even for inference - what we
-            # don't need is the deepspeed basic optimizer which is self.optimizer.optimizer
-            deepspeed_engine.optimizer.optimizer = None
-            deepspeed_engine.lr_scheduler = None

         model = self._wrap_model(self.model, training=False)
@@ -341,6 +341,9 @@ def load_tf_weights_in_albert(*args, **kwargs):
 MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = None


+MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING = None
+
+
 MODEL_FOR_CAUSAL_LM_MAPPING = None


@@ -2661,7 +2664,7 @@ class IBertPreTrainedModel:
 IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST = None


-class ImageGPTForCausalLM:
+class ImageGPTForCausalImageModeling:
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])

@@ -697,11 +697,10 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
     def test_basic_distributed(self, stage):
         self.run_and_check(stage=stage, distributed=True)

-    @parameterized.expand(stages)
-    def test_do_eval_no_train(self, stage):
-        # we should not fail if train is skipped
+    def test_do_eval_no_train(self):
+        # testing only zero3 since zero2 makes no sense with inference
         self.run_and_check(
-            stage=stage,
+            stage=ZERO3,
             eval_steps=1,
             distributed=False,
             do_train=False,
@@ -755,6 +754,22 @@ class TestDeepSpeedWithLauncher(TestCasePlus):

         self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)

+    @require_torch_multi_gpu
+    @parameterized.expand(["fp16", "fp32"])
+    def test_inference(self, dtype):
+        # this is just inference, so no optimizer should be loaded
+        # it only works for z3 (makes no sense with z1-z2)
+        fp16 = True if dtype == "fp16" else False
+        self.run_and_check(
+            stage=ZERO3,
+            model_name=T5_TINY,
+            distributed=True,
+            do_train=False,
+            do_eval=True,
+            quality_checks=False,
+            fp16=fp16,
+        )
+
     def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True):

         if do_train:
@@ -414,6 +414,7 @@ def prepare_img():
     return image


+
 @require_torch
 @require_vision
 class BeitModelIntegrationTest(unittest.TestCase):
     @cached_property
@@ -61,6 +61,7 @@ if is_torch_available():

     from transformers import (
         BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
         MODEL_FOR_CAUSAL_LM_MAPPING,
         MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
         MODEL_FOR_MASKED_LM_MAPPING,
@@ -150,6 +151,7 @@ class ModelTesterMixin:
             elif model_class in [
                 *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
                 *get_values(MODEL_FOR_CAUSAL_LM_MAPPING),
+                *get_values(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING),
                 *get_values(MODEL_FOR_MASKED_LM_MAPPING),
                 *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
             ]:
@@ -391,6 +391,7 @@ def prepare_img():
     return image


+
 @require_torch
 @require_vision
 class DeiTModelIntegrationTest(unittest.TestCase):
     @cached_property
@@ -34,7 +34,7 @@ if is_torch_available():

     from transformers import (
         IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST,
-        ImageGPTForCausalLM,
+        ImageGPTForCausalImageModeling,
         ImageGPTForImageClassification,
         ImageGPTModel,
     )
@@ -207,14 +207,14 @@ class ImageGPTModelTester:
         self.parent.assertEqual(len(result.past_key_values), config.n_layer)

     def create_and_check_lm_head_model(self, config, pixel_values, input_mask, head_mask, token_type_ids, *args):
-        model = ImageGPTForCausalLM(config)
+        model = ImageGPTForCausalImageModeling(config)
         model.to(torch_device)
         model.eval()

         labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size - 1)
         result = model(pixel_values, token_type_ids=token_type_ids, labels=labels)
         self.parent.assertEqual(result.loss.shape, ())
-        # ImageGPTForCausalLM doens't have tied input- and output embeddings
+        # ImageGPTForCausalImageModeling doens't have tied input- and output embeddings
         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size - 1))

     def create_and_check_imagegpt_for_image_classification(
@@ -255,9 +255,9 @@ class ImageGPTModelTester:
 class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):

     all_model_classes = (
-        (ImageGPTForCausalLM, ImageGPTForImageClassification, ImageGPTModel) if is_torch_available() else ()
+        (ImageGPTForCausalImageModeling, ImageGPTForImageClassification, ImageGPTModel) if is_torch_available() else ()
     )
-    all_generative_model_classes = (ImageGPTForCausalLM,) if is_torch_available() else ()
+    all_generative_model_classes = (ImageGPTForCausalImageModeling,) if is_torch_available() else ()
     test_missing_keys = False
     input_name = "pixel_values"

@@ -273,7 +273,7 @@ class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):

         return inputs_dict

-    # we overwrite the _check_scores method of GenerationTesterMixin, as ImageGPTForCausalLM doesn't have tied input- and output embeddings
+    # we overwrite the _check_scores method of GenerationTesterMixin, as ImageGPTForCausalImageModeling doesn't have tied input- and output embeddings
     def _check_scores(self, batch_size, scores, length, config):
         expected_shape = (batch_size, config.vocab_size - 1)
         self.assertIsInstance(scores, tuple)
@@ -519,7 +519,7 @@ class ImageGPTModelIntegrationTest(unittest.TestCase):

     @slow
     def test_inference_causal_lm_head(self):
-        model = ImageGPTForCausalLM.from_pretrained("openai/imagegpt-small").to(torch_device)
+        model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small").to(torch_device)

         feature_extractor = self.default_feature_extractor
         image = prepare_img()
@@ -353,7 +353,7 @@ class TFViTModelTest(TFModelTesterMixin, unittest.TestCase):
     @slow
     def test_model_from_pretrained(self):

-        model = TFViTModel.from_pretrained("google/vit-base-patch16-224", from_pt=True)
+        model = TFViTModel.from_pretrained("google/vit-base-patch16-224")
         self.assertIsNotNone(model)


@@ -363,6 +363,7 @@ def prepare_img():
     return image


+
 @require_tf
 @require_vision
 class TFViTModelIntegrationTest(unittest.TestCase):
     @cached_property
@@ -331,6 +331,7 @@ def prepare_img():
     return image


+
 @require_torch
 @require_vision
 class ViTModelIntegrationTest(unittest.TestCase):
     @cached_property
@@ -114,12 +114,12 @@ class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
         audio = np.array(dataset[3]["speech"], dtype=np.float32)
         output = audio_classifier(audio, top_k=4)
         self.assertEqual(
-            nested_simplify(output, decimals=4),
+            nested_simplify(output, decimals=3),
             [
-                {"score": 0.9809, "label": "go"},
-                {"score": 0.0073, "label": "up"},
-                {"score": 0.0064, "label": "_unknown_"},
-                {"score": 0.0015, "label": "down"},
+                {"score": 0.981, "label": "go"},
+                {"score": 0.007, "label": "up"},
+                {"score": 0.006, "label": "_unknown_"},
+                {"score": 0.001, "label": "down"},
             ],
         )
