diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index e9d774b562e..872121b4432 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -610,6 +610,7 @@ class WhisperEncoder(WhisperPreTrainedModel):
     def forward(
         self,
         input_features,
+        attention_mask=None,
         head_mask=None,
         output_attentions=None,
         output_hidden_states=None,
@@ -624,12 +625,14 @@ class WhisperEncoder(WhisperPreTrainedModel):
                 `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting the mel features,
                 padding and conversion into a tensor of type `torch.FloatTensor`. See
                 [`~WhisperFeatureExtractor.__call__`]
+            attention_mask (`torch.Tensor`, *optional*):
+                Whisper does not support masking of the `input_features`; this argument is preserved for
+                compatibility but is not used. By default, the silence in the input log mel spectrogram is ignored.
             head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                 Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
 
                 - 1 indicates the head is **not masked**,
                 - 0 indicates the head is **masked**.
-
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more detail.
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index c6bf4c09581..04e36410ce9 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import inspect
 from collections import defaultdict
 from typing import TYPE_CHECKING, Dict, Optional, Union
 
@@ -289,10 +288,6 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
         is_last = model_inputs.pop("is_last")
         if self.type == "seq2seq":
             encoder = self.model.get_encoder()
-            # we need to pass `processed.get("attention_mask")` here since audio encoder
-            # attention mask length is different from expected text decoder `encoder_attention_mask` length
-            # `generate` magic to create the mask automatically won't work, we basically need to help
-            # it here.
             # Consume values so we can let extra information flow freely through
             # the pipeline (important for `partial` in microphone)
             if "input_features" in model_inputs:
@@ -305,15 +300,15 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
                     f"`input_features` or `input_values` key, but only has {model_inputs.keys()}"
                 )
 
-            accepts_attention_mask = "attention_mask" in set(inspect.signature(encoder.forward).parameters.keys())
-            if accepts_attention_mask:
-                attention_mask = model_inputs.pop("attention_mask", None)
-                tokens = self.model.generate(
-                    encoder_outputs=encoder(inputs, attention_mask=attention_mask),
-                    attention_mask=attention_mask,
-                )
-            else:
-                tokens = self.model.generate(inputs)
+            # we need to pass `processed.get("attention_mask")` here since audio encoder
+            # attention mask length is different from expected text decoder `encoder_attention_mask` length
+            # `generate` magic to create the mask automatically won't work, we basically need to help
+            # it here.
+            attention_mask = model_inputs.pop("attention_mask", None)
+            tokens = self.model.generate(
+                encoder_outputs=encoder(inputs, attention_mask=attention_mask),
+                attention_mask=attention_mask,
+            )
             out = {"tokens": tokens}
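
For reference, here is a minimal sketch of the seq2seq code path this patch simplifies, assuming the patched `WhisperEncoder` (which now accepts and ignores `attention_mask`). The checkpoint name and the short silent waveform are illustrative choices, not part of the change:

```python
# Sketch of the pipeline's seq2seq branch after this patch, with the signature
# inspection removed: `attention_mask` is always forwarded to the encoder and
# to `generate`, and Whisper simply ignores it.
import numpy as np

from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

waveform = np.zeros(2 * 16000, dtype=np.float32)  # 2 s of silence stands in for real speech
processed = processor(waveform, sampling_rate=16000, return_tensors="pt")

encoder = model.get_encoder()
attention_mask = processed.get("attention_mask")  # None for Whisper by default

# No more `inspect.signature` check: the encoder accepts `attention_mask`
# unconditionally, so generation always takes this single branch.
tokens = model.generate(
    encoder_outputs=encoder(processed["input_features"], attention_mask=attention_mask),
    attention_mask=attention_mask,
)
print(processor.batch_decode(tokens, skip_special_tokens=True))
```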