use latest __init__ standards and auto-generate modular

geetu040 2025-01-27 07:48:21 +05:00
parent c54f8045ec
commit d8d3c409d8
2 changed files with 19 additions and 83 deletions

__init__.py

@@ -15,54 +15,15 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
-from ...utils import (
-    OptionalDependencyNotAvailable,
-    _LazyModule,
-    is_torch_available,
-)
-_import_structure = {
-    "configuration_minimax_text_01": ["MiniMaxText01Config"],
-}
-try:
-    if not is_torch_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    pass
-else:
-    _import_structure["modeling_minimax_text_01"] = [
-        "MiniMaxText01ForCausalLM",
-        "MiniMaxText01ForQuestionAnswering",
-        "MiniMaxText01Model",
-        "MiniMaxText01PreTrainedModel",
-        "MiniMaxText01ForSequenceClassification",
-        "MiniMaxText01ForTokenClassification",
-    ]
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
 if TYPE_CHECKING:
-    from .configuration_minimax_text_01 import MiniMaxText01Config
-    try:
-        if not is_torch_available():
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        pass
-    else:
-        from .modeling_minimax_text_01 import (
-            MiniMaxText01ForCausalLM,
-            MiniMaxText01ForQuestionAnswering,
-            MiniMaxText01ForSequenceClassification,
-            MiniMaxText01ForTokenClassification,
-            MiniMaxText01Model,
-            MiniMaxText01PreTrainedModel,
-        )
+    from .configuration_minimax_text_01 import *
+    from .modeling_minimax_text_01 import *
 else:
     import sys
-    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
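Assembled from the added lines above, the new-style __init__.py reduces to the lazy, auto-generated pattern below. This is a sketch with blank lines and indentation inferred, not copied from the repository:

from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure

if TYPE_CHECKING:
    from .configuration_minimax_text_01 import *
    from .modeling_minimax_text_01 import *
else:
    import sys

    # define_import_structure() builds the export map from the sibling modules,
    # so the hand-maintained _import_structure dict is no longer needed.
    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

Imports of the public classes continue to resolve lazily through `_LazyModule`; only the export bookkeeping moves out of the file.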

modeling_minimax_text_01.py

@@ -259,13 +259,6 @@ def eager_attention_forward(
         causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
         attn_weights = attn_weights + causal_mask
-    # print()
-    # ic(module.layer_idx)
-    # show_tensor(query, False, True)
-    # show_tensor(key_states, False, True)
-    # show_tensor(value_states, False, True)
-    # show_tensor(attn_weights, False, True)
     attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
     attn_output = torch.matmul(attn_weights, value_states)
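For reference, the eager attention path that remains once the debug calls are gone is the standard masked softmax attention. A minimal standalone sketch follows; the scaling step and tensor shapes are assumptions based on the usual layout, not taken from this hunk:

import math

import torch
import torch.nn.functional as F

def eager_attention(query, key_states, value_states, attention_mask=None, dropout=0.0, training=False):
    # query, key_states, value_states: (batch, num_heads, seq_len, head_dim)
    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) / math.sqrt(query.size(-1))
    if attention_mask is not None:
        # slice the causal mask to the current key length, as in the hunk above
        attn_weights = attn_weights + attention_mask[:, :, :, : key_states.shape[-2]]
    attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = F.dropout(attn_weights, p=dropout, training=training)
    return torch.matmul(attn_weights, value_states), attn_weights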
@@ -310,23 +303,11 @@ class MiniMaxText01Attention(nn.Module):
         cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-        # print(self.layer_idx)
-        # show_tensor(query_states, end=False, only_shapes=False)
-        # show_tensor(key_states, end=False, only_shapes=True)
-        # show_tensor(value_states, end=True, only_shapes=True)
-        # print()
-        # print()
-        # ic(self.layer_idx)
-        # show_tensor(key_states, False, True)
         if past_key_value is not None:
             # sin and cos are specific to RoPE models; cache_position needed for the static cache
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-            # show_tensor(key_states, False, True)
         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
             if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
@@ -351,10 +332,6 @@ class MiniMaxText01Attention(nn.Module):
         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
         attn_output = self.o_proj(attn_output)
-        # ic(self.layer_idx)
-        # show_tensor(attn_output, False, True)
         return attn_output, attn_weights
@@ -592,7 +569,7 @@ class MiniMaxText01RotaryEmbedding(nn.Module):
         2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
         """
         seq_len = torch.max(position_ids) + 1
-        if seq_len > self.max_seq_len_cached:  # growth_dynamic_frequency_update
+        if seq_len > self.max_seq_len_cached:  # growth
             inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
             self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
             self.max_seq_len_cached = seq_len
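For context on what the `# growth` branch recomputes: `inv_freq` holds one rotation frequency per pair of head dimensions, and `rope_init_fn` rebuilds it (together with `attention_scaling`) once `seq_len` exceeds the cached maximum. Below is a sketch of the unscaled default frequencies only; the scaled variants depend on `config.rope_scaling` and are not shown in this hunk:

import torch

def default_rope_inv_freq(head_dim: int, base: float = 10000.0) -> torch.Tensor:
    # One inverse frequency per pair of rotary dimensions; dynamic RoPE variants
    # recompute these with a length-dependent base when the cache is outgrown.
    return 1.0 / (base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))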
@@ -628,7 +605,7 @@ class MiniMaxText01RotaryEmbedding(nn.Module):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-MINI_MAX_TEXT01_START_DOCSTRING = r"""
+MINIMAX_TEXT_01_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -647,7 +624,7 @@ MINI_MAX_TEXT01_START_DOCSTRING = r"""
 @add_start_docstrings(
     "The bare MiniMaxText01 Model outputting raw hidden-states without any specific head on top.",
-    MINI_MAX_TEXT01_START_DOCSTRING,
+    MINIMAX_TEXT_01_START_DOCSTRING,
 )
 class MiniMaxText01PreTrainedModel(PreTrainedModel):
     config_class = MiniMaxText01Config
@@ -674,7 +651,7 @@ class MiniMaxText01PreTrainedModel(PreTrainedModel):
                 module.weight.data[module.padding_idx].zero_()
-MINI_MAX_TEXT01_INPUTS_DOCSTRING = r"""
+MINIMAX_TEXT_01_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -751,7 +728,7 @@ MINI_MAX_TEXT01_INPUTS_DOCSTRING = r"""
 @add_start_docstrings(
     "The bare MiniMaxText01 Model outputting raw hidden-states without any specific head on top.",
-    MINI_MAX_TEXT01_START_DOCSTRING,
+    MINIMAX_TEXT_01_START_DOCSTRING,
 )
 class MiniMaxText01Model(MiniMaxText01PreTrainedModel):
     """
@@ -783,7 +760,7 @@ class MiniMaxText01Model(MiniMaxText01PreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
-    @add_start_docstrings_to_model_forward(MINI_MAX_TEXT01_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MINIMAX_TEXT_01_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -820,7 +797,6 @@ class MiniMaxText01Model(MiniMaxText01PreTrainedModel):
             )
             use_cache = False
-        # TODO: raise exception here?
         if use_cache and past_key_values is None:
             past_key_values = DynamicCache()
@@ -1173,7 +1149,7 @@ class MiniMaxText01ForCausalLM(MiniMaxText01PreTrainedModel, GenerationMixin):
     def get_decoder(self):
         return self.model
-    @add_start_docstrings_to_model_forward(MINI_MAX_TEXT01_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MINIMAX_TEXT_01_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1222,7 +1198,6 @@ class MiniMaxText01ForCausalLM(MiniMaxText01PreTrainedModel, GenerationMixin):
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
-        # ic(input_ids.shape, input_ids)
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_router_logits = (
@@ -1299,7 +1274,7 @@ class MiniMaxText01ForCausalLM(MiniMaxText01PreTrainedModel, GenerationMixin):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    MINI_MAX_TEXT01_START_DOCSTRING,
+    MINIMAX_TEXT_01_START_DOCSTRING,
 )
 class MiniMaxText01ForSequenceClassification(MiniMaxText01PreTrainedModel):
     def __init__(self, config):
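The pooling rule described in the docstring above (classify on the last non-padding token, or simply the last position when no pad token is defined) can be sketched as follows; the helper name and padding handling are illustrative, not the module's exact code:

import torch

def last_token_index(input_ids: torch.LongTensor, pad_token_id: int) -> torch.LongTensor:
    # Index of the last non-padding token in each row; works for right- and left-padded batches.
    non_pad = (input_ids != pad_token_id).to(torch.int32)
    return non_pad.cumsum(dim=-1).argmax(dim=-1)

# The pooled hidden state then feeds a single linear head, e.g.:
# pooled = hidden_states[torch.arange(batch_size), last_token_index(input_ids, pad_token_id)]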
@@ -1317,7 +1292,7 @@ class MiniMaxText01ForSequenceClassification(MiniMaxText01PreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
-    @add_start_docstrings_to_model_forward(MINI_MAX_TEXT01_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MINIMAX_TEXT_01_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -1395,7 +1370,7 @@ class MiniMaxText01ForSequenceClassification(MiniMaxText01PreTrainedModel):
     The MiniMaxText01 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
     output) e.g. for Named-Entity-Recognition (NER) tasks.
     """,
-    MINI_MAX_TEXT01_START_DOCSTRING,
+    MINIMAX_TEXT_01_START_DOCSTRING,
 )
 class MiniMaxText01ForTokenClassification(MiniMaxText01PreTrainedModel):
     def __init__(self, config):
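The token-classification head described above is, in the usual transformers layout, just dropout followed by a per-token linear projection to `num_labels`. A small sketch with made-up dimensions:

import torch
import torch.nn as nn

hidden_size, num_labels = 4096, 9                 # illustrative values, not from the config
dropout = nn.Dropout(0.1)
score = nn.Linear(hidden_size, num_labels)

hidden_states = torch.randn(2, 16, hidden_size)   # (batch, seq_len, hidden_size) from the base model
logits = score(dropout(hidden_states))            # (batch, seq_len, num_labels), one score vector per token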
@@ -1420,7 +1395,7 @@ class MiniMaxText01ForTokenClassification(MiniMaxText01PreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
-    @add_start_docstrings_to_model_forward(MINI_MAX_TEXT01_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MINIMAX_TEXT_01_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TokenClassifierOutput,
@@ -1483,7 +1458,7 @@ class MiniMaxText01ForTokenClassification(MiniMaxText01PreTrainedModel):
     The MiniMaxText01 Model transformer with a span classification head on top for extractive question-answering tasks like
     SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
     """,
-    MINI_MAX_TEXT01_START_DOCSTRING,
+    MINIMAX_TEXT_01_START_DOCSTRING,
 )
 class MiniMaxText01ForQuestionAnswering(MiniMaxText01PreTrainedModel):
     base_model_prefix = "model"
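Likewise, the span-classification head described above is a single linear layer producing two logits per token, split into `span start logits` and `span end logits`. A sketch with illustrative dimensions:

import torch
import torch.nn as nn

hidden_size = 4096                                 # illustrative, not from the config
qa_outputs = nn.Linear(hidden_size, 2)

hidden_states = torch.randn(2, 16, hidden_size)    # (batch, seq_len, hidden_size)
start_logits, end_logits = qa_outputs(hidden_states).split(1, dim=-1)
start_logits = start_logits.squeeze(-1)            # (batch, seq_len) span-start scores
end_logits = end_logits.squeeze(-1)                # (batch, seq_len) span-end scores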
@@ -1502,7 +1477,7 @@ class MiniMaxText01ForQuestionAnswering(MiniMaxText01PreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
-    @add_start_docstrings_to_model_forward(MINI_MAX_TEXT01_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MINIMAX_TEXT_01_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,