Removed unused encoder_hidden_states and encoder_attention_mask (#8972)

* Removed unused `encoder_hidden_states` and `encoder_attention_mask` from MobileBert (see the usage sketch after this list)

* Removed decoder tests for MobileBert

* Removed now unnecessary import
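
After this change, MobileBERT exposes a plain encoder-only call surface. The snippet below is a minimal usage sketch, not part of the commit itself; it assumes the public `transformers` API and the `google/mobilebert-uncased` checkpoint, and simply shows that the forward pass no longer takes `encoder_hidden_states` or `encoder_attention_mask`.

```python
import torch
from transformers import MobileBertModel, MobileBertTokenizer

# Standard MobileBERT checkpoint; any MobileBERT checkpoint behaves the same here.
tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
model = MobileBertModel.from_pretrained("google/mobilebert-uncased")

inputs = tokenizer("MobileBERT is a compact BERT variant.", return_tensors="pt")

# Only the model's own inputs are accepted; passing encoder_hidden_states or
# encoder_attention_mask after this commit raises a TypeError.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
print(outputs.pooler_output.shape)      # (batch_size, hidden_size)
```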
Author: guillaume-be · 2020-12-08 18:04:34 +01:00 (committed by GitHub)
Parent: b7cdd00f15
Commit: 7809eb82ae
2 changed files with 1 addition and 134 deletions

modeling_mobilebert.py

@@ -251,8 +251,6 @@ class MobileBertSelfAttention(nn.Module):
value_tensor,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=None,
):
mixed_query_layer = self.query(query_tensor)
@@ -335,8 +333,6 @@ class MobileBertAttention(nn.Module):
layer_input,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=None,
):
self_outputs = self.self(
@@ -345,8 +341,6 @@ class MobileBertAttention(nn.Module):
value_tensor,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
)
# Run a linear projection of `hidden_size` then add a residual
@@ -498,8 +492,6 @@ class MobileBertLayer(nn.Module):
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=None,
):
if self.use_bottleneck:
@@ -554,8 +546,6 @@ class MobileBertEncoder(nn.Module):
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
@@ -570,8 +560,6 @@ class MobileBertEncoder(nn.Module):
hidden_states,
attention_mask,
head_mask[i],
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
)
hidden_states = layer_outputs[0]
@@ -783,16 +771,6 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
tensors for more detail.
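
The docstring entries removed above described tensors that MobileBERT accepted but never routed into any cross-attention. As a purely illustrative aside (the sizes and tensor contents below are made up, not taken from the commit), this is the shape and mask convention those entries referred to:

```python
import torch

batch_size, src_len, hidden_size = 2, 7, 512  # illustrative sizes only

# What the removed docstring described: encoder outputs plus a padding mask
# where 1 marks a token to attend to and 0 marks padding to ignore.
encoder_hidden_states = torch.randn(batch_size, src_len, hidden_size)
encoder_attention_mask = torch.tensor([
    [1, 1, 1, 1, 1, 0, 0],  # sequence of length 5, padded to 7
    [1, 1, 1, 1, 1, 1, 1],  # full-length sequence
])
```

Since no MobileBERT layer consumed these tensors, documenting them only suggested decoder/cross-attention support that the model does not have.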
@@ -852,8 +830,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_hidden_states=None,
output_attentions=None,
return_dict=None,
@@ -886,17 +862,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
attention_mask, input_shape, self.device
)
# If a 2D or 3D attention mask is provided for the cross-attention
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
if encoder_attention_mask is None:
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
else:
encoder_extended_attention_mask = None
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
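
The block removed in this hunk defaulted `encoder_attention_mask` to all ones and inverted it into a broadcastable additive mask, but the result (`encoder_extended_attention_mask`) was never passed anywhere, so it was dead code. For context, here is a simplified approximation of what that inversion computes (a sketch of the idea, not the library's exact `invert_attention_mask` implementation):

```python
import torch

def invert_attention_mask_sketch(encoder_attention_mask: torch.Tensor) -> torch.Tensor:
    # Expand [batch, src_len] to [batch, 1, 1, src_len] so it broadcasts over
    # heads and query positions, then turn 1/0 into an additive mask:
    # 0.0 where attention is allowed, a large negative value where it is masked.
    extended = encoder_attention_mask[:, None, None, :].to(torch.float32)
    return (1.0 - extended) * -10000.0

mask = torch.tensor([[1, 1, 1, 0]])
additive = invert_attention_mask_sketch(mask)  # last position gets -10000.0
```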
@@ -911,8 +876,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
@@ -1083,8 +1046,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
head_mask=None,
inputs_embeds=None,
labels=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
@@ -1104,8 +1065,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,

test_modeling_mobilebert.py

@@ -20,7 +20,7 @@ from transformers import is_torch_available
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
+from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
if is_torch_available():
@@ -128,33 +128,6 @@ class MobileBertModelTester:
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def prepare_config_and_inputs_for_decoder(self):
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = self.prepare_config_and_inputs()
config.is_decoder = True
encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
return (
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
)
def create_and_check_mobilebert_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
@@ -168,39 +141,6 @@ class MobileBertModelTester:
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_mobilebert_model_as_decoder(
self,
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
):
model = MobileBertModel(config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
)
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
)
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_mobilebert_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
@@ -352,38 +292,6 @@ class MobileBertModelTest(ModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_mobilebert_model(*config_and_inputs)
def test_mobilebert_model_as_decoder(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_mobilebert_model_as_decoder(*config_and_inputs)
def test_mobilebert_model_as_decoder_with_default_input_mask(self):
# This regression test was failing with PyTorch < 1.3
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
) = self.model_tester.prepare_config_and_inputs_for_decoder()
input_mask = None
self.model_tester.create_and_check_mobilebert_model_as_decoder(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_mobilebert_for_masked_lm(*config_and_inputs)
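
With the decoder helpers gone, the tester exercises MobileBERT only as a plain encoder. The snippet below is a hypothetical, condensed version of that remaining check (it is not the test shipped in the repo; the default config and the helper-free input construction are stand-ins), showing the shape assertions the retained `create_and_check_mobilebert_model` path performs:

```python
import unittest

import torch

from transformers import MobileBertConfig, MobileBertModel


class MobileBertEncoderOnlyShapeTest(unittest.TestCase):
    def test_forward_shapes(self):
        batch_size, seq_length = 2, 7
        config = MobileBertConfig()  # library defaults; a smaller custom config works too

        input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_length))
        attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)
        token_type_ids = torch.zeros(batch_size, seq_length, dtype=torch.long)

        model = MobileBertModel(config)
        model.eval()
        with torch.no_grad():
            result = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # Same assertions as the retained encoder-only test path.
        self.assertEqual(result.last_hidden_state.shape, (batch_size, seq_length, config.hidden_size))
        self.assertEqual(result.pooler_output.shape, (batch_size, config.hidden_size))


if __name__ == "__main__":
    unittest.main()
```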