diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst
index 928d5184614..af64f83d5f8 100644
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -142,6 +142,20 @@ AutoModelForAudioClassification
     :members:
 
 
+AutoModelForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForCTC
+    :members:
+
+
+AutoModelForSpeechSeq2Seq
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForSpeechSeq2Seq
+    :members:
+
+
 AutoModelForObjectDetection
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 5c633e8e9b3..98db9f5de02 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -557,6 +557,7 @@ if is_torch_available():
         "AutoModel",
         "AutoModelForAudioClassification",
         "AutoModelForCausalLM",
+        "AutoModelForCTC",
         "AutoModelForImageClassification",
         "AutoModelForMaskedLM",
         "AutoModelForMultipleChoice",
@@ -566,6 +567,7 @@ if is_torch_available():
         "AutoModelForQuestionAnswering",
         "AutoModelForSeq2SeqLM",
         "AutoModelForSequenceClassification",
+        "AutoModelForSpeechSeq2Seq",
         "AutoModelForTableQuestionAnswering",
         "AutoModelForTokenClassification",
         "AutoModelWithLMHead",
@@ -2320,6 +2322,7 @@ if TYPE_CHECKING:
         AutoModel,
         AutoModelForAudioClassification,
         AutoModelForCausalLM,
+        AutoModelForCTC,
         AutoModelForImageClassification,
         AutoModelForMaskedLM,
         AutoModelForMultipleChoice,
@@ -2329,6 +2332,7 @@ if TYPE_CHECKING:
         AutoModelForQuestionAnswering,
         AutoModelForSeq2SeqLM,
         AutoModelForSequenceClassification,
+        AutoModelForSpeechSeq2Seq,
         AutoModelForTableQuestionAnswering,
         AutoModelForTokenClassification,
         AutoModelWithLMHead,
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index 6b99d6a91ba..1666f483a7e 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -32,6 +32,7 @@ if is_torch_available():
     _import_structure["modeling_auto"] = [
         "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
         "MODEL_FOR_CAUSAL_LM_MAPPING",
+        "MODEL_FOR_CTC_MAPPING",
         "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
         "MODEL_FOR_MASKED_LM_MAPPING",
         "MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
@@ -41,6 +42,7 @@ if is_torch_available():
         "MODEL_FOR_QUESTION_ANSWERING_MAPPING",
         "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
         "MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
+        "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
         "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
         "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
         "MODEL_MAPPING",
@@ -48,6 +50,7 @@ if is_torch_available():
         "AutoModel",
         "AutoModelForAudioClassification",
         "AutoModelForCausalLM",
+        "AutoModelForCTC",
         "AutoModelForImageClassification",
         "AutoModelForMaskedLM",
         "AutoModelForMultipleChoice",
@@ -57,6 +60,7 @@ if is_torch_available():
         "AutoModelForQuestionAnswering",
         "AutoModelForSeq2SeqLM",
         "AutoModelForSequenceClassification",
+        "AutoModelForSpeechSeq2Seq",
         "AutoModelForTableQuestionAnswering",
         "AutoModelForTokenClassification",
         "AutoModelWithLMHead",
@@ -124,6 +128,7 @@ if TYPE_CHECKING:
     from .modeling_auto import (
         MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
         MODEL_FOR_CAUSAL_LM_MAPPING,
+        MODEL_FOR_CTC_MAPPING,
         MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
         MODEL_FOR_MASKED_LM_MAPPING,
         MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
@@ -133,6 +138,7 @@ if TYPE_CHECKING:
         MODEL_FOR_QUESTION_ANSWERING_MAPPING,
         MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
         MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+        MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
         MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
         MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
         MODEL_MAPPING,
@@ -140,6 +146,7 @@ if TYPE_CHECKING:
         AutoModel,
         AutoModelForAudioClassification,
         AutoModelForCausalLM,
+        AutoModelForCTC,
         AutoModelForImageClassification,
         AutoModelForMaskedLM,
         AutoModelForMultipleChoice,
@@ -149,6 +156,7 @@ if TYPE_CHECKING:
         AutoModelForQuestionAnswering,
         AutoModelForSeq2SeqLM,
         AutoModelForSequenceClassification,
+        AutoModelForSpeechSeq2Seq,
         AutoModelForTableQuestionAnswering,
         AutoModelForTokenClassification,
         AutoModelWithLMHead,
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index eb971c96898..e246728d680 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -291,6 +291,13 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
     ]
 )
 
+MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
+    [
+        ("speech-encoder-decoder", "SpeechEncoderDecoderModel"),
+        ("speech_to_text", "Speech2TextForConditionalGeneration"),
+    ]
+)
+
 MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
     [
         # Model for Sequence Classification mapping
@@ -462,6 +469,14 @@ MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
     ]
 )
 
+MODEL_FOR_CTC_MAPPING_NAMES = OrderedDict(
+    [
+        # Model for Connectionist temporal classification (CTC) mapping
+        ("wav2vec2", "Wav2Vec2ForCTC"),
+        ("hubert", "HubertForCTC"),
+    ]
+)
+
 MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
 MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES)
 MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES)
@@ -493,6 +508,8 @@ MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping(
 MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
 )
+MODEL_FOR_CTC_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_CTC_MAPPING_NAMES)
+MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES)
 
 
 class AutoModel(_BaseAutoModelClass):
@@ -611,6 +628,22 @@ class AutoModelForAudioClassification(_BaseAutoModelClass):
 AutoModelForAudioClassification = auto_class_update(AutoModelForAudioClassification, head_doc="audio classification")
 
 
+class AutoModelForCTC(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_CTC_MAPPING
+
+
+AutoModelForCTC = auto_class_update(AutoModelForCTC, head_doc="connectionist temporal classification")
+
+
+class AutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
+
+
+AutoModelForSpeechSeq2Seq = auto_class_update(
+    AutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
+)
+
+
 class AutoModelWithLMHead(_AutoModelWithLMHead):
     @classmethod
     def from_config(cls, config):
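(Reviewer note, not part of the patch: a minimal usage sketch of the two new auto classes. Both subclass `_BaseAutoModelClass`, so config-based dispatch through the lazy mappings works exactly like the existing auto classes. `facebook/wav2vec2-base-960h` is the task default registered below; `facebook/s2t-small-librispeech-asr` is assumed here as an illustrative public Speech2Text checkpoint.)

    from transformers import AutoModelForCTC, AutoModelForSpeechSeq2Seq

    # "wav2vec2" config type -> Wav2Vec2ForCTC, via MODEL_FOR_CTC_MAPPING
    ctc_model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")

    # "speech_to_text" config type -> Speech2TextForConditionalGeneration,
    # via MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
    seq2seq_model = AutoModelForSpeechSeq2Seq.from_pretrained("facebook/s2t-small-librispeech-asr")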
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index ad2040895d7..3d46a372dcf 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -90,12 +90,14 @@ if is_torch_available():
         AutoModel,
         AutoModelForAudioClassification,
         AutoModelForCausalLM,
+        AutoModelForCTC,
         AutoModelForImageClassification,
         AutoModelForMaskedLM,
         AutoModelForObjectDetection,
         AutoModelForQuestionAnswering,
         AutoModelForSeq2SeqLM,
         AutoModelForSequenceClassification,
+        AutoModelForSpeechSeq2Seq,
         AutoModelForTableQuestionAnswering,
         AutoModelForTokenClassification,
     )
@@ -121,9 +123,7 @@ SUPPORTED_TASKS = {
     "automatic-speech-recognition": {
         "impl": AutomaticSpeechRecognitionPipeline,
         "tf": (),
-        # Only load from `config.architectures`, AutoModelForCTC and AutoModelForConditionalGeneration
-        # do not exist yet.
-        "pt": () if is_torch_available() else (),
+        "pt": (AutoModelForCTC, AutoModelForSpeechSeq2Seq) if is_torch_available() else (),
         "default": {"model": {"pt": "facebook/wav2vec2-base-960h"}},
     },
     "feature-extraction": {
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 35bde41d62c..9dc6fa58b13 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -16,6 +16,7 @@ from typing import TYPE_CHECKING, Union
 
 import numpy as np
 
+from ..file_utils import is_torch_available
 from ..utils import logging
 from .base import Pipeline
 
@@ -25,6 +26,9 @@ if TYPE_CHECKING:
 
 logger = logging.get_logger(__name__)
 
+if is_torch_available():
+    from ..models.auto.modeling_auto import MODEL_FOR_CTC_MAPPING, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
+
 
 def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
     """
@@ -102,6 +106,8 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
         if self.framework == "tf":
             raise ValueError("The AutomaticSpeechRecognitionPipeline is only available in PyTorch.")
 
+        self.check_model_type(MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.items() + MODEL_FOR_CTC_MAPPING.items())
+
     def __call__(
         self,
         inputs: Union[np.ndarray, bytes, str],
@@ -149,8 +155,8 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
         return processed
 
     def _forward(self, model_inputs):
-        name = self.model.__class__.__name__
-        if name.endswith("ForConditionalGeneration") or name.endswith("EncoderDecoderModel"):
+        model_class = self.model.__class__
+        if model_class in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.values():
             encoder = self.model.get_encoder()
             # we need to pass `processed.get("attention_mask")` here since audio encoder
             # attention mask length is different from expected text decoder `encoder_attention_mask` length
@@ -160,7 +166,7 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
                 encoder_outputs=encoder(**model_inputs), attention_mask=model_inputs.get("attention_mask")
             )
             tokens = tokens.squeeze(0)
-        elif name.endswith("ForCTC"):
+        elif model_class in MODEL_FOR_CTC_MAPPING.values():
             outputs = self.model(**model_inputs)
             tokens = outputs.logits.squeeze(0).argmax(dim=-1)
         return tokens
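(Reviewer note, not part of the patch: a sketch of the new `_forward` dispatch. Matching `self.model.__class__` against the lazy mappings replaces the brittle `name.endswith(...)` checks, so any model registered in `MODEL_FOR_CTC_MAPPING` takes the greedy-argmax branch. The one-second 16 kHz waveform of zeros below is an illustrative assumption.)

    import torch

    from transformers import AutoModelForCTC
    from transformers.models.auto.modeling_auto import MODEL_FOR_CTC_MAPPING

    model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")

    # the same membership test _forward now performs instead of name.endswith("ForCTC")
    assert model.__class__ in MODEL_FOR_CTC_MAPPING.values()

    # CTC branch, mirroring _forward: greedy argmax over the frame-level logits
    input_values = torch.zeros(1, 16000)  # illustrative: one second of silence at 16 kHz
    tokens = model(input_values=input_values).logits.squeeze(0).argmax(dim=-1)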
["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class AutoModelForTableQuestionAnswering: def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) diff --git a/tests/test_pipelines_automatic_speech_recognition.py b/tests/test_pipelines_automatic_speech_recognition.py index 4ee03271f4a..7deec451379 100644 --- a/tests/test_pipelines_automatic_speech_recognition.py +++ b/tests/test_pipelines_automatic_speech_recognition.py @@ -49,10 +49,10 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): @require_torch def test_torch_small_no_tokenizer_files(self): # test that model without tokenizer file cannot be loaded - with pytest.raises(ValueError): + with pytest.raises(OSError): pipeline( task="automatic-speech-recognition", - model="hf-internal-testing/tiny-random-wav2vec2", + model="patrickvonplaten/tiny-wav2vec2-no-tokenizer", framework="pt", )