diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index b1e47131c75..3b04ccbb79a 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -1213,7 +1213,10 @@ class TFWav2Vec2MainLayer(tf.keras.layers.Layer):
         if inputs["attention_mask"] is not None:
             # compute real output lengths according to convolution formula
             output_lengths = self._get_feat_extract_output_lengths(tf.reduce_sum(inputs["attention_mask"], -1))
-            attention_mask = tf.sequence_mask(output_lengths, dtype=hidden_states.dtype)
+
+            attention_mask = tf.sequence_mask(
+                output_lengths, maxlen=shape_list(hidden_states)[1], dtype=hidden_states.dtype
+            )
 
         hidden_states = self.feature_projection(hidden_states, training=inputs["training"])
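
For context, here is a minimal standalone sketch (not part of the patch; the tensor shapes and values are illustrative assumptions) of the behavior the added `maxlen` argument addresses. Without `maxlen`, `tf.sequence_mask` sizes the mask to the longest length in the batch, which can be shorter than the padded time dimension of `hidden_states`; pinning `maxlen` to that dimension keeps the mask aligned with the tensor it is applied to. The patch uses the library's `shape_list` helper for the same purpose; the sketch uses plain `tf.shape`:

```python
import tensorflow as tf

# Hypothetical shapes: a batch of 3 feature sequences padded to 10
# frames, with true (post-convolution) lengths 4, 7, and 5.
hidden_states = tf.zeros((3, 10, 768))
output_lengths = tf.constant([4, 7, 5])

# Old behavior: maxlen defaults to max(output_lengths) == 7, so the
# mask's time dimension (7) disagrees with hidden_states' (10).
mask_old = tf.sequence_mask(output_lengths, dtype=hidden_states.dtype)
print(mask_old.shape)  # (3, 7)

# Fixed behavior: maxlen is pinned to the time dimension of
# hidden_states, so the mask always matches, padding included.
mask_new = tf.sequence_mask(
    output_lengths, maxlen=tf.shape(hidden_states)[1], dtype=hidden_states.dtype
)
print(mask_new.shape)  # (3, 10)
```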