diff --git a/src/transformers/models/minimax_text_01/__init__.py b/src/transformers/models/minimax_text_01/__init__.py
index 1d65a515cf1..2f92703446d 100644
--- a/src/transformers/models/minimax_text_01/__init__.py
+++ b/src/transformers/models/minimax_text_01/__init__.py
@@ -15,54 +15,15 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import (
-    OptionalDependencyNotAvailable,
-    _LazyModule,
-    is_torch_available,
-)
-
-
-_import_structure = {
-    "configuration_minimax_text_01": ["MiniMaxText01Config"],
-}
-
-
-try:
-    if not is_torch_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    pass
-else:
-    _import_structure["modeling_minimax_text_01"] = [
-        "MiniMaxText01ForCausalLM",
-        "MiniMaxText01ForQuestionAnswering",
-        "MiniMaxText01Model",
-        "MiniMaxText01PreTrainedModel",
-        "MiniMaxText01ForSequenceClassification",
-        "MiniMaxText01ForTokenClassification",
-    ]
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
 
 
 if TYPE_CHECKING:
-    from .configuration_minimax_text_01 import MiniMaxText01Config
-
-    try:
-        if not is_torch_available():
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        pass
-    else:
-        from .modeling_minimax_text_01 import (
-            MiniMaxText01ForCausalLM,
-            MiniMaxText01ForQuestionAnswering,
-            MiniMaxText01ForSequenceClassification,
-            MiniMaxText01ForTokenClassification,
-            MiniMaxText01Model,
-            MiniMaxText01PreTrainedModel,
-        )
-
-
+    from .configuration_minimax_text_01 import *
+    from .modeling_minimax_text_01 import *
 else:
     import sys
 
-    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/src/transformers/models/minimax_text_01/modeling_minimax_text_01.py b/src/transformers/models/minimax_text_01/modeling_minimax_text_01.py
index 9e33a18a58a..b0d01400a6b 100644
--- a/src/transformers/models/minimax_text_01/modeling_minimax_text_01.py
+++ b/src/transformers/models/minimax_text_01/modeling_minimax_text_01.py
@@ -259,13 +259,6 @@ def eager_attention_forward(
         causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
         attn_weights = attn_weights + causal_mask
 
-    # print()
-    # ic(module.layer_idx)
-    # show_tensor(query, False, True)
-    # show_tensor(key_states, False, True)
-    # show_tensor(value_states, False, True)
-    # show_tensor(attn_weights, False, True)
-
     attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
     attn_output = torch.matmul(attn_weights, value_states)
@@ -310,23 +303,11 @@ class MiniMaxText01Attention(nn.Module):
         cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
-        # print(self.layer_idx)
-        # show_tensor(query_states, end=False, only_shapes=False)
-        # show_tensor(key_states, end=False, only_shapes=True)
-        # show_tensor(value_states, end=True, only_shapes=True)
-
-        # print()
-        # print()
-        # ic(self.layer_idx)
-        # show_tensor(key_states, False, True)
-
         if past_key_value is not None:
             # sin and cos are specific to RoPE models; cache_position needed for the static cache
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-        # show_tensor(key_states, False, True)
-
         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
             if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
@@ -351,10 +332,6 @@ class MiniMaxText01Attention(nn.Module):
 
         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
         attn_output = self.o_proj(attn_output)
-
-        # ic(self.layer_idx)
-        # show_tensor(attn_output, False, True)
-
         return attn_output, attn_weights
 
 
@@ -592,7 +569,7 @@ class MiniMaxText01RotaryEmbedding(nn.Module):
         2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
         """
         seq_len = torch.max(position_ids) + 1
-        if seq_len > self.max_seq_len_cached:  # growth_dynamic_frequency_update
+        if seq_len > self.max_seq_len_cached:  # growth
             inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
             self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
             self.max_seq_len_cached = seq_len
@@ -628,7 +605,7 @@ class MiniMaxText01RotaryEmbedding(nn.Module):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
 
-MINI_MAX_TEXT01_START_DOCSTRING = r"""
+MINIMAX_TEXT_01_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -647,7 +624,7 @@ MINI_MAX_TEXT01_START_DOCSTRING = r"""
 
 @add_start_docstrings(
     "The bare MiniMaxText01 Model outputting raw hidden-states without any specific head on top.",
-    MINI_MAX_TEXT01_START_DOCSTRING,
+    MINIMAX_TEXT_01_START_DOCSTRING,
 )
 class MiniMaxText01PreTrainedModel(PreTrainedModel):
     config_class = MiniMaxText01Config
@@ -674,7 +651,7 @@ class MiniMaxText01PreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
 
-MINI_MAX_TEXT01_INPUTS_DOCSTRING = r"""
+MINIMAX_TEXT_01_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -751,7 +728,7 @@ MINI_MAX_TEXT01_INPUTS_DOCSTRING = r"""
 
 @add_start_docstrings(
     "The bare MiniMaxText01 Model outputting raw hidden-states without any specific head on top.",
-    MINI_MAX_TEXT01_START_DOCSTRING,
+    MINIMAX_TEXT_01_START_DOCSTRING,
 )
 class MiniMaxText01Model(MiniMaxText01PreTrainedModel):
     """
@@ -783,7 +760,7 @@ class MiniMaxText01Model(MiniMaxText01PreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(MINI_MAX_TEXT01_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MINIMAX_TEXT_01_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -820,7 +797,6 @@ class MiniMaxText01Model(MiniMaxText01PreTrainedModel):
             )
             use_cache = False
 
-        # TODO: raise exception here?
         if use_cache and past_key_values is None:
             past_key_values = DynamicCache()
 
@@ -1173,7 +1149,7 @@ class MiniMaxText01ForCausalLM(MiniMaxText01PreTrainedModel, GenerationMixin):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(MINI_MAX_TEXT01_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MINIMAX_TEXT_01_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1222,7 +1198,6 @@ class MiniMaxText01ForCausalLM(MiniMaxText01PreTrainedModel, GenerationMixin):
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
-        # ic(input_ids.shape, input_ids)
 
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
@@ -1299,7 +1274,7 @@ class MiniMaxText01ForCausalLM(MiniMaxText01PreTrainedModel, GenerationMixin):
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
-    MINI_MAX_TEXT01_START_DOCSTRING,
+    MINIMAX_TEXT_01_START_DOCSTRING,
 )
 class MiniMaxText01ForSequenceClassification(MiniMaxText01PreTrainedModel):
     def __init__(self, config):
@@ -1317,7 +1292,7 @@ class MiniMaxText01ForSequenceClassification(MiniMaxText01PreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(MINI_MAX_TEXT01_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MINIMAX_TEXT_01_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -1395,7 +1370,7 @@ class MiniMaxText01ForSequenceClassification(MiniMaxText01PreTrainedModel):
    The MiniMaxText01 Model transformer with a token classification head on top (a linear layer on top of the
    hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
-    MINI_MAX_TEXT01_START_DOCSTRING,
+    MINIMAX_TEXT_01_START_DOCSTRING,
 )
 class MiniMaxText01ForTokenClassification(MiniMaxText01PreTrainedModel):
     def __init__(self, config):
@@ -1420,7 +1395,7 @@ class MiniMaxText01ForTokenClassification(MiniMaxText01PreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(MINI_MAX_TEXT01_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MINIMAX_TEXT_01_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TokenClassifierOutput,
@@ -1483,7 +1458,7 @@ class MiniMaxText01ForTokenClassification(MiniMaxText01PreTrainedModel):
    The MiniMaxText01 Model transformer with a span classification head on top for extractive question-answering tasks like
    SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
-    MINI_MAX_TEXT01_START_DOCSTRING,
+    MINIMAX_TEXT_01_START_DOCSTRING,
 )
 class MiniMaxText01ForQuestionAnswering(MiniMaxText01PreTrainedModel):
     base_model_prefix = "model"
@@ -1502,7 +1477,7 @@ class MiniMaxText01ForQuestionAnswering(MiniMaxText01PreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(MINI_MAX_TEXT01_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MINIMAX_TEXT_01_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
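For reference, a minimal usage sketch of the lazy-import path introduced in the rewritten __init__.py (module and class names are taken from the patch above; assumes a transformers build that ships this model):

    # sketch: with the new __init__.py, the package module is replaced by _LazyModule,
    # built from define_import_structure(__file__), so a submodule such as
    # configuration_minimax_text_01 is only imported when one of its attributes
    # is first accessed -- no torch-availability check is needed up front.
    from transformers.models import minimax_text_01

    config = minimax_text_01.MiniMaxText01Config()  # first attribute access triggers the real import
    print(type(config).__name__)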