From e830495c1ca7fa12653b6dec6ffe0b244ac4dc1d Mon Sep 17 00:00:00 2001
From: Thien Tran
Date: Mon, 30 Oct 2023 18:52:24 +0800
Subject: [PATCH] Fix data2vec-audio note about attention mask (#27116)

fix data2vec audio note about attention mask
---
 .../models/data2vec/modeling_data2vec_audio.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py
index cf15d8508d5..47cf2d6245e 100755
--- a/src/transformers/models/data2vec/modeling_data2vec_audio.py
+++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py
@@ -786,12 +786,11 @@ DATA2VEC_AUDIO_INPUTS_DOCSTRING = r"""
-            `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
-            True`. For all models whose processor has `config.return_attention_mask == False`, such as
-            [data2vec-audio-base](https://huggingface.co/facebook/data2vec-audio-base-960h), `attention_mask` should
-            **not** be passed to avoid degraded performance when doing batched inference. For such models
-            `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware that these
-            models also yield slightly different results depending on whether `input_values` is padded or not.
+            `attention_mask` should be passed if the corresponding processor has `config.return_attention_mask ==
+            True`, which is the case for all pre-trained Data2Vec Audio models. Be aware that even with
+            `attention_mask`, zero-padded inputs will have slightly different outputs compared to non-padded inputs
+            because there is more than one convolutional layer in the positional encodings. For a more detailed
+            explanation, see [here](https://github.com/huggingface/transformers/issues/25621#issuecomment-1713759349).
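
For reference, a minimal sketch (not part of the patch) of the batched-inference pattern the updated note describes: the processor zero-pads `input_values` and returns an `attention_mask`, which is then passed to the model. The checkpoint name `facebook/data2vec-audio-base-960h` (taken from the removed text) and the dummy waveforms are illustrative assumptions, not part of the change.

```python
# Sketch only: batched inference with the processor-generated attention_mask,
# following the updated docstring note. The checkpoint and dummy audio below
# are illustrative assumptions.
import numpy as np
import torch
from transformers import AutoProcessor, Data2VecAudioForCTC

processor = AutoProcessor.from_pretrained("facebook/data2vec-audio-base-960h")
model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h")

# Two dummy waveforms of different lengths; padding=True zero-pads the shorter one.
audio = [
    np.random.randn(16000).astype(np.float32),
    np.random.randn(12000).astype(np.float32),
]
inputs = processor(audio, sampling_rate=16000, padding=True, return_tensors="pt")

# The processor returns `attention_mask` because config.return_attention_mask is
# True for pre-trained Data2Vec Audio models, so pass it along with `input_values`.
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids))
```

As the note explains, even with `attention_mask` the padded batch can yield slightly different logits than running each clip unpadded, because the convolutional positional encodings see the zero padding.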