From c54f8045ec7ca36e01e0f469a53f6e7177bbfb40 Mon Sep 17 00:00:00 2001
From: geetu040
Date: Fri, 24 Jan 2025 20:13:31 +0500
Subject: [PATCH] put minimax_text_01 in other files

---
 docs/source/ar/_toctree.yml                   |  2 +
 docs/source/ar/conversations.md               |  2 +-
 docs/source/ar/index.md                       |  1 +
 docs/source/ar/trainer.md                     |  2 +-
 docs/source/en/_toctree.yml                   |  2 +
 docs/source/en/conversations.md               |  2 +-
 docs/source/en/index.md                       |  1 +
 docs/source/en/perf_infer_gpu_one.md          |  3 ++
 docs/source/ko/conversations.md               |  2 +-
 docs/source/zh/index.md                       |  1 +
 src/transformers/__init__.py                  | 20 +++++++++
 src/transformers/models/__init__.py           |  1 +
 .../models/auto/configuration_auto.py         |  2 +
 src/transformers/models/auto/modeling_auto.py |  5 +++
 .../models/auto/tokenization_auto.py          |  7 ++++
 src/transformers/utils/dummy_pt_objects.py    | 42 +++++++++++++++++++
 src/transformers/utils/fx.py                  |  1 +
 .../modular/test_conversion_order.py          |  2 +
 tests/test_modeling_common.py                 |  2 +-
 utils/not_doctested.txt                       |  3 ++
 20 files changed, 98 insertions(+), 5 deletions(-)

diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml
index 30e247eb54e..f09cba37265 100644
--- a/docs/source/ar/_toctree.yml
+++ b/docs/source/ar/_toctree.yml
@@ -454,6 +454,8 @@
 #     title: Mistral
 #   - local: model_doc/mixtral
 #     title: Mixtral
+#   - local: model_doc/minimax_text_01
+#     title: MiniMaxText01
 #   - local: model_doc/mluke
 #     title: mLUKE
 #   - local: model_doc/mobilebert
diff --git a/docs/source/ar/conversations.md b/docs/source/ar/conversations.md
index 00e6fe814ea..dffb1836de2 100644
--- a/docs/source/ar/conversations.md
+++ b/docs/source/ar/conversations.md
@@ -201,4 +201,4 @@ pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", device
 لذلك، إذا كنت تريد تحسين سرعة توليد النص، فإن الحل الأسهل هو إما تقليل حجم النموذج في الذاكرة (عادةً عن طريق التكميم)، أو الحصول على عتاد بسرعة أكبر في الذاكرة.
 بالنسبة للمستخدمين المتقدمين، هناك عدة تقنيات أخرى للتغلب على هذه القيود. الأكثر شيوعًا هي المتغيرات على [التوليد بمساعدة](https://huggingface.co/blog/assisted-generation)، المعروف أيضًا باسم "العينات التخمينية (speculative sampling)". تحاول هذه التقنيات تخمين عدة رموز مستقبلية في وقت واحد، غالبًا باستخدام نموذج "مسودة (draft model)" أصغر، ثم تأكيد هذه التوليدات باستخدام نموذج الدردشة. إذا تم التحقق من صحة التخمينات بواسطة نموذج الدردشة، فيمكن إنشاء أكثر من رمز واحد لكل تمرير للأمام، مما يخفف بشكل كبير من القيود المتعلقة بالسعة ويحسن سرعة التوليد.
 
-أخيرًا، يجب أن نلاحظ أيضًا تأثير نماذج "مزيج الخبراء" "Mixture of Experts" (MoE) هنا. العديد من نماذج المحادثة الشهيرة، مثل Mixtral وQwen-MoE وDBRX، هي نماذج MoE. في هذه النماذج، لا تكون كل معلمة نشطة لكل رمز يتم إنشاؤه. ونتيجة لذلك، فإن نماذج MoE لديها عمومًا متطلبات ذاكرة أقل بكثير، على الرغم من أن حجمها الإجمالي يمكن أن يكون كبيرًا جدًا. لذلك يمكن أن تكون أسرع عدة مرات من نموذج "كثيف" عادي بنفس الحجم. ومع ذلك، فإن التقنيات مثل التوليد المساعد غير فعالة بشكل عام لهذه النماذج لأن المزيد من المعلمات ستصبح نشطة مع كل رمز جديد يتم التكهن به، والذي سيبطل فوائد السعة والسرعة التي توفرها بنية MoE.
\ No newline at end of file
+أخيرًا، يجب أن نلاحظ أيضًا تأثير نماذج "مزيج الخبراء" "Mixture of Experts" (MoE) هنا. العديد من نماذج المحادثة الشهيرة، مثل Mixtral وMiniMaxText01 وQwen-MoE وDBRX، هي نماذج MoE. في هذه النماذج، لا تكون كل معلمة نشطة لكل رمز يتم إنشاؤه. ونتيجة لذلك، فإن نماذج MoE لديها عمومًا متطلبات ذاكرة أقل بكثير، على الرغم من أن حجمها الإجمالي يمكن أن يكون كبيرًا جدًا. لذلك يمكن أن تكون أسرع عدة مرات من نموذج "كثيف" عادي بنفس الحجم. ومع ذلك، فإن التقنيات مثل التوليد المساعد غير فعالة بشكل عام لهذه النماذج لأن المزيد من المعلمات ستصبح نشطة مع كل رمز جديد يتم التكهن به، والذي سيبطل فوائد السعة والسرعة التي توفرها بنية MoE.
\ No newline at end of file
diff --git a/docs/source/ar/index.md b/docs/source/ar/index.md
index c37dbd1c6d9..299245d789f 100644
--- a/docs/source/ar/index.md
+++ b/docs/source/ar/index.md
@@ -196,6 +196,7 @@
 | [MGP-STR](model_doc/mgp-str) | ✅ | ❌ | ❌ |
 | [Mistral](model_doc/mistral) | ✅ | ✅ | ✅ |
 | [Mixtral](model_doc/mixtral) | ✅ | ❌ | ❌ |
+| [MiniMaxText01](model_doc/minimax_text_01) | ✅ | ❌ | ❌ |
 | [mLUKE](model_doc/mluke) | ✅ | ❌ | ❌ |
 | [MMS](model_doc/mms) | ✅ | ✅ | ✅ |
 | [MobileBERT](model_doc/mobilebert) | ✅ | ✅ | ❌ |
diff --git a/docs/source/ar/trainer.md b/docs/source/ar/trainer.md
index 7da7cbf4e17..ddf62a5fe0d 100644
--- a/docs/source/ar/trainer.md
+++ b/docs/source/ar/trainer.md
@@ -265,7 +265,7 @@ training_args = TrainingArguments(
 )
 ```
 
-تدعم النواة معماريات نماذج Llama و Gemma و Mistral و Mixtral. يُمكن العثور على أحدث قائمة بالنمائج المدعومة [هنا](https://github.com/linkedin/Liger-Kernel). عندما يتم تعيين `use_liger_kernel` إلى `True`، سيتم تصحيح الطبقات المُقابلة في النموذج الأصلي باستخدام تطبيق Liger الفعال، لذلك لا تحتاج إلى فعل أي شيء إضافي بخلاف تعيين قيمة المعامل.
+تدعم النواة معماريات نماذج Llama و Gemma و Mistral و Mixtral و MiniMaxText01. يُمكن العثور على أحدث قائمة بالنماذج المدعومة [هنا](https://github.com/linkedin/Liger-Kernel). عندما يتم تعيين `use_liger_kernel` إلى `True`، سيتم تصحيح الطبقات المُقابلة في النموذج الأصلي باستخدام تطبيق Liger الفعال، لذلك لا تحتاج إلى فعل أي شيء إضافي بخلاف تعيين قيمة المعامل.
 
 ## المُحسِّنات
 يمكنك اختيار مُحسِّن مدمج للتدريب باستخدام:
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 3e8bcd9ece1..dfca7a06d52 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -498,6 +498,8 @@
       title: Mistral
     - local: model_doc/mixtral
       title: Mixtral
+    - local: model_doc/minimax_text_01
+      title: MiniMaxText01
     - local: model_doc/mluke
       title: mLUKE
     - local: model_doc/mobilebert
diff --git a/docs/source/en/conversations.md b/docs/source/en/conversations.md
index a48c046b494..21fae659ac5 100644
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@@ -281,7 +281,7 @@ confirm these generations with the chat model. If the guesses are validated by t
 be generated per forward pass, which greatly alleviates the bandwidth bottleneck and improves generation speed.
 
 Finally, we should also note the impact of "Mixture of Experts" (MoE) models here. Several popular chat models,
-such as Mixtral, Qwen-MoE and DBRX, are MoE models. In these models, not every parameter is active for every token generated.
+such as Mixtral, MiniMaxText01, Qwen-MoE and DBRX, are MoE models. In these models, not every parameter is active for every token generated.
 As a result, MoE models generally have much lower memory bandwidth requirements, even though their total size can be
 quite large. They can therefore be several times faster than a normal "dense" model of the same size. However,
 techniques like assisted generation are generally ineffective for these models because more parameters will become
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index ace8f76f7d0..8615c0eef56 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -228,6 +228,7 @@ Flax), PyTorch, and/or TensorFlow.
 | [Mimi](model_doc/mimi) | ✅ | ❌ | ❌ |
 | [Mistral](model_doc/mistral) | ✅ | ✅ | ✅ |
 | [Mixtral](model_doc/mixtral) | ✅ | ❌ | ❌ |
+| [MiniMaxText01](model_doc/minimax_text_01) | ✅ | ❌ | ❌ |
 | [Mllama](model_doc/mllama) | ✅ | ❌ | ❌ |
 | [mLUKE](model_doc/mluke) | ✅ | ❌ | ❌ |
 | [MMS](model_doc/mms) | ✅ | ✅ | ✅ |
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index d9bdf6f6e48..3fff55e2505 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -77,6 +77,7 @@ FlashAttention-2 is currently supported for the following architectures:
 * [MBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel)
 * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
 * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
+* [MiniMaxText01](https://huggingface.co/docs/transformers/model_doc/minimax_text_01#transformers.MiniMaxText01Model)
 * [ModernBert](https://huggingface.co/docs/transformers/model_doc/modernbert#transformers.ModernBert)
 * [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel)
 * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
@@ -274,6 +275,7 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
 * [Mllama](https://huggingface.co/docs/transformers/model_doc/mllama#transformers.MllamaForConditionalGeneration)
 * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
+* [MiniMaxText01](https://huggingface.co/docs/transformers/model_doc/minimax_text_01#transformers.MiniMaxText01Model)
 * [ModernBert](https://huggingface.co/docs/transformers/model_doc/modernbert#transformers.ModernBert)
 * [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel)
 * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
@@ -292,6 +294,7 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel)
 * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
 * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
+* [MiniMaxText01](https://huggingface.co/docs/transformers/model_doc/minimax_text_01#transformers.MiniMaxText01Model)
 * [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel)
 * [Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2#transformers.Starcoder2Model)
 * [Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2Model)
diff --git a/docs/source/ko/conversations.md b/docs/source/ko/conversations.md
index 920cb138786..daae2fb69ae 100644
--- a/docs/source/ko/conversations.md
+++ b/docs/source/ko/conversations.md
@@ -296,7 +296,7 @@ pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", device
 병목 현상이 크게 줄어들고 생성 속도가 빨라집니다.
 
 마지막으로, "Mixture of Experts" (MoE) 모델에 대해서도 짚고 넘어가 보도록 합니다.
-Mixtral, Qwen-MoE, DBRX와 같은 인기 있는 채팅 모델이 바로 MoE 모델입니다.
+Mixtral, MiniMaxText01, Qwen-MoE, DBRX와 같은 인기 있는 채팅 모델이 바로 MoE 모델입니다.
 이 모델들은 토큰을 생성할 때 모든 파라미터가 사용되지 않습니다.
 이로 인해 MoE 모델은 전체 크기가 상당히 클 수 있지만, 차지하는 메모리 대역폭은 낮은 편입니다.
diff --git a/docs/source/zh/index.md b/docs/source/zh/index.md
index 3750e506b0e..3f032cedffd 100644
--- a/docs/source/zh/index.md
+++ b/docs/source/zh/index.md
@@ -191,6 +191,7 @@ rendered properly in your Markdown viewer.
 | [MGP-STR](../en/model_doc/mgp-str) | ✅ | ❌ | ❌ |
 | [Mistral](../en/model_doc/mistral) | ✅ | ❌ | ✅ |
 | [Mixtral](../en/model_doc/mixtral) | ✅ | ❌ | ❌ |
+| [MiniMaxText01](../en/model_doc/minimax_text_01) | ✅ | ❌ | ❌ |
 | [mLUKE](../en/model_doc/mluke) | ✅ | ❌ | ❌ |
 | [MMS](../en/model_doc/mms) | ✅ | ✅ | ✅ |
 | [MobileBERT](../en/model_doc/mobilebert) | ✅ | ✅ | ❌ |
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 64b5f4b52c4..1566a02dc2c 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -602,6 +602,7 @@ _import_structure = {
     "models.mimi": ["MimiConfig"],
     "models.mistral": ["MistralConfig"],
     "models.mixtral": ["MixtralConfig"],
+    "models.minimax_text_01": ["MiniMaxText01Config"],
     "models.mllama": [
         "MllamaConfig",
         "MllamaProcessor",
@@ -2868,6 +2869,16 @@ else:
             "MixtralPreTrainedModel",
         ]
     )
+    _import_structure["models.minimax_text_01"].extend(
+        [
+            "MiniMaxText01ForCausalLM",
+            "MiniMaxText01ForQuestionAnswering",
+            "MiniMaxText01ForSequenceClassification",
+            "MiniMaxText01ForTokenClassification",
+            "MiniMaxText01Model",
+            "MiniMaxText01PreTrainedModel",
+        ]
+    )
     _import_structure["models.mllama"].extend(
         [
             "MllamaForCausalLM",
@@ -5657,6 +5668,7 @@ if TYPE_CHECKING:
     )
     from .models.mistral import MistralConfig
     from .models.mixtral import MixtralConfig
+    from .models.minimax_text_01 import MiniMaxText01Config
     from .models.mllama import (
         MllamaConfig,
         MllamaProcessor,
@@ -7661,6 +7673,14 @@ if TYPE_CHECKING:
         MixtralModel,
         MixtralPreTrainedModel,
     )
+    from .models.minimax_text_01 import (
+        MiniMaxText01ForCausalLM,
+        MiniMaxText01ForQuestionAnswering,
+        MiniMaxText01ForSequenceClassification,
+        MiniMaxText01ForTokenClassification,
+        MiniMaxText01Model,
+        MiniMaxText01PreTrainedModel,
+    )
     from .models.mllama import (
         MllamaForCausalLM,
         MllamaForConditionalGeneration,
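A note on the two `_import_structure` hunks and the mirrored `TYPE_CHECKING` imports above: `src/transformers/__init__.py` only registers names at import time, and the defining submodule is imported on first attribute access. Below is a minimal sketch of that lazy-module pattern, simplified from the library's `_LazyModule`; the class and helper names here are illustrative, not the exact implementation.

```python
import importlib
import types


class _LazyModuleSketch(types.ModuleType):
    """Resolve registered names to their defining submodule on first access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # e.g. {"models.minimax_text_01": ["MiniMaxText01Config", ...]}
        self._class_to_module = {
            cls: module for module, classes in import_structure.items() for cls in classes
        }

    def __getattr__(self, attr):
        if attr not in self._class_to_module:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        submodule = importlib.import_module("." + self._class_to_module[attr], self.__name__)
        value = getattr(submodule, attr)
        setattr(self, attr, value)  # cache the result so __getattr__ is hit only once per name
        return value
```

This is why a new model has to be wired in twice in the same file: once in the `_import_structure` registry for runtime lazy loading, and once under `TYPE_CHECKING` so static analyzers see real imports.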
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 69c3c26cfa2..086d23800e4 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -164,6 +164,7 @@ from . import (
     mimi,
     mistral,
     mixtral,
+    minimax_text_01,
     mllama,
     mluke,
     mobilebert,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index e93c201bb0c..c540c45f6d1 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -185,6 +185,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
         ("mimi", "MimiConfig"),
         ("mistral", "MistralConfig"),
         ("mixtral", "MixtralConfig"),
+        ("minimax_text_01", "MiniMaxText01Config"),
         ("mllama", "MllamaConfig"),
         ("mobilebert", "MobileBertConfig"),
         ("mobilenet_v1", "MobileNetV1Config"),
@@ -516,6 +517,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
         ("mimi", "Mimi"),
         ("mistral", "Mistral"),
         ("mixtral", "Mixtral"),
+        ("minimax_text_01", "MiniMaxText01"),
         ("mllama", "Mllama"),
         ("mluke", "mLUKE"),
         ("mms", "MMS"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index fc24fd4dcfa..0b8be71c712 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -174,6 +174,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
         ("mimi", "MimiModel"),
         ("mistral", "MistralModel"),
         ("mixtral", "MixtralModel"),
+        ("minimax_text_01", "MiniMaxText01Model"),
         ("mobilebert", "MobileBertModel"),
         ("mobilenet_v1", "MobileNetV1Model"),
         ("mobilenet_v2", "MobileNetV2Model"),
@@ -531,6 +532,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
         ("megatron-bert", "MegatronBertForCausalLM"),
         ("mistral", "MistralForCausalLM"),
         ("mixtral", "MixtralForCausalLM"),
+        ("minimax_text_01", "MiniMaxText01ForCausalLM"),
         ("mllama", "MllamaForCausalLM"),
         ("moshi", "MoshiForCausalLM"),
         ("mpt", "MptForCausalLM"),
@@ -1010,6 +1012,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
         ("megatron-bert", "MegatronBertForSequenceClassification"),
         ("mistral", "MistralForSequenceClassification"),
         ("mixtral", "MixtralForSequenceClassification"),
+        ("minimax_text_01", "MiniMaxText01ForSequenceClassification"),
         ("mobilebert", "MobileBertForSequenceClassification"),
         ("modernbert", "ModernBertForSequenceClassification"),
         ("mpnet", "MPNetForSequenceClassification"),
@@ -1098,6 +1101,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
         ("megatron-bert", "MegatronBertForQuestionAnswering"),
         ("mistral", "MistralForQuestionAnswering"),
         ("mixtral", "MixtralForQuestionAnswering"),
+        ("minimax_text_01", "MiniMaxText01ForQuestionAnswering"),
         ("mobilebert", "MobileBertForQuestionAnswering"),
         ("mpnet", "MPNetForQuestionAnswering"),
         ("mpt", "MptForQuestionAnswering"),
@@ -1200,6 +1204,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
         ("megatron-bert", "MegatronBertForTokenClassification"),
         ("mistral", "MistralForTokenClassification"),
         ("mixtral", "MixtralForTokenClassification"),
+        ("minimax_text_01", "MiniMaxText01ForTokenClassification"),
         ("mobilebert", "MobileBertForTokenClassification"),
         ("modernbert", "ModernBertForTokenClassification"),
         ("mpnet", "MPNetForTokenClassification"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 9ce9edd06cb..8e2ae7452dc 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -319,6 +319,13 @@ else:
                     "LlamaTokenizerFast" if is_tokenizers_available() else None,
                 ),
             ),
+            (
+                "minimax_text_01",
+                (
+                    "GPT2Tokenizer",
+                    "GPT2TokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
             ("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
             ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
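With the auto mappings above in place, the `model_type: "minimax_text_01"` field in a checkpoint's `config.json` is enough for the Auto classes to dispatch to the new model. A rough usage sketch follows; the checkpoint id is a placeholder for illustration, not a published repo.

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

checkpoint = "MiniMaxAI/MiniMax-Text-01"  # placeholder repo id, assumed for illustration

config = AutoConfig.from_pretrained(checkpoint)  # resolves to MiniMaxText01Config
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # GPT2TokenizerFast per the mapping above
model = AutoModelForCausalLM.from_pretrained(checkpoint)  # MiniMaxText01ForCausalLM
```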
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 2574af7e8a4..799467cd084 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -6315,6 +6315,48 @@ class MixtralPreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
+class MiniMaxText01ForCausalLM(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MiniMaxText01ForQuestionAnswering(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MiniMaxText01ForSequenceClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MiniMaxText01ForTokenClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MiniMaxText01Model(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MiniMaxText01PreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class MllamaForCausalLM(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py
index 45fa3d9ca68..c70b45a4b13 100755
--- a/src/transformers/utils/fx.py
+++ b/src/transformers/utils/fx.py
@@ -150,6 +150,7 @@ _REGULAR_SUPPORTED_MODEL_NAMES_AND_TASKS = [
     "megatron-bert",
     "mistral",
     "mixtral",
+    "minimax_text_01",
     "mobilebert",
     "mt5",
     "nezha",
diff --git a/tests/repo_utils/modular/test_conversion_order.py b/tests/repo_utils/modular/test_conversion_order.py
index f5e133ce1fe..1dbd18f71a4 100644
--- a/tests/repo_utils/modular/test_conversion_order.py
+++ b/tests/repo_utils/modular/test_conversion_order.py
@@ -19,6 +19,7 @@ FILES_TO_PARSE = [
     os.path.join(MODEL_ROOT, "granite", "modular_granite.py"),
     os.path.join(MODEL_ROOT, "gemma2", "modular_gemma2.py"),
     os.path.join(MODEL_ROOT, "mixtral", "modular_mixtral.py"),
+    os.path.join(MODEL_ROOT, "minimax_text_01", "modular_minimax_text_01.py"),
     os.path.join(MODEL_ROOT, "olmo", "modular_olmo.py"),
     os.path.join(MODEL_ROOT, "rt_detr", "modular_rt_detr.py"),
     os.path.join(MODEL_ROOT, "qwen2", "modular_qwen2.py"),
@@ -53,6 +54,7 @@ class ConversionOrderTest(unittest.TestCase):
         model_priority_list = [file.rsplit("modular_")[-1].replace(".py", "") for file in priority_list]
 
         # These are based on what the current library order should be (as of 09/01/2025)
+        self.assertTrue(appear_after("minimax_text_01", "mixtral", model_priority_list))
         self.assertTrue(appear_after("mixtral", "mistral", model_priority_list))
         self.assertTrue(appear_after("gemma2", "gemma", model_priority_list))
         self.assertTrue(appear_after("starcoder2", "mistral", model_priority_list))
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index cf259fabe30..3ce9a84adb3 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -4327,7 +4327,7 @@ class ModelTesterMixin:
         if not self.has_attentions:
             self.skipTest(reason="Model architecture does not support attentions")
 
-        WINDOW_ATTENTION_MODELS = ["mistral", "mixtral", "qwen2", "qwen_moe", "starcoder2"]
+        WINDOW_ATTENTION_MODELS = ["mistral", "mixtral", "minimax_text_01", "qwen2", "qwen_moe", "starcoder2"]
 
         if len(self.all_generative_model_classes) == 0:
             self.skipTest(f"No generative model classes for {self.__class__.__name__}")
diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt
index 0a36fcbd8a5..0a9c635f197 100644
--- a/utils/not_doctested.txt
+++ b/utils/not_doctested.txt
@@ -165,6 +165,7 @@ docs/source/en/model_doc/megatron_gpt2.md
 docs/source/en/model_doc/mgp-str.md
 docs/source/en/model_doc/mistral.md
 docs/source/en/model_doc/mixtral.md
+docs/source/en/model_doc/minimax_text_01.md
 docs/source/en/model_doc/mluke.md
 docs/source/en/model_doc/mms.md
 docs/source/en/model_doc/mobilebert.md
@@ -675,6 +676,8 @@ src/transformers/models/mistral/configuration_mistral.py
 src/transformers/models/mistral/modeling_mistral.py
 src/transformers/models/mixtral/configuration_mixtral.py
 src/transformers/models/mixtral/modeling_mixtral.py
+src/transformers/models/minimax_text_01/configuration_minimax_text_01.py
+src/transformers/models/minimax_text_01/modeling_minimax_text_01.py
 src/transformers/models/mluke/convert_mluke_original_pytorch_checkpoint_to_pytorch.py
 src/transformers/models/mobilebert/convert_mobilebert_original_tf_checkpoint_to_pytorch.py
 src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py
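Finally, some context on the `dummy_pt_objects.py` hunk earlier in the patch: the generated dummies keep `from transformers import MiniMaxText01Model` importable in an environment without PyTorch and defer the failure to first use with an actionable error. A minimal self-contained sketch of the mechanism, simplified from the real helpers (the availability check below is a stand-in, and the class name is hypothetical):

```python
import importlib.util


def _backend_available(backend):
    # Simplified stand-in for transformers' per-backend availability checks.
    return importlib.util.find_spec(backend) is not None


def requires_backends(obj, backends):
    # Raise a readable error naming the missing backends (sketch of the real helper).
    name = obj.__name__ if isinstance(obj, type) else obj.__class__.__name__
    missing = [b for b in backends if not _backend_available(b)]
    if missing:
        raise ImportError(f"{name} requires the {', '.join(missing)} backend(s) to be installed.")


class MiniMaxText01ModelSketch:
    """Mirrors a generated dummy: importable anywhere, unusable without torch."""

    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
```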