# Copyright 2024 Mistral AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the TF 2.0 Mistral model."""

import unittest

import numpy as np

from transformers import AutoTokenizer, MistralConfig, is_tf_available
from transformers.testing_utils import (
    require_tf,
    slow,
)

from ...test_configuration_common import ConfigTester
from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
from ...test_pipeline_mixin import PipelineTesterMixin


if is_tf_available():
    import tensorflow as tf

    from transformers.models.mistral.modeling_tf_mistral import (
        TFMistralForCausalLM,
        TFMistralForSequenceClassification,
        TFMistralModel,
    )


class TFMistralModelTester:
    def __init__(self, parent):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = False
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 2
        self.num_attention_heads = 4
        self.num_key_value_heads = 2
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.pad_token_id = 0
        self.scope = None
        self.bos_token_id = self.vocab_size - 1
        self.eos_token_id = self.vocab_size - 1
        self.pad_token_id = self.vocab_size - 1

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = MistralConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            num_key_value_heads=self.num_key_value_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            is_decoder=False,
            initializer_range=self.initializer_range,
            pad_token_id=self.pad_token_id,
        )

        return (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        )
    def create_and_check_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFMistralModel(config=config)
        result = model(input_ids, attention_mask=input_mask)
        result = model(input_ids)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))

    def create_and_check_model_as_decoder(
        self,
        config,
        input_ids,
        token_type_ids,
        input_mask,
        sequence_labels,
        token_labels,
        choice_labels,
        encoder_hidden_states,
        encoder_attention_mask,
    ):
        config.add_cross_attention = True
        model = TFMistralModel(config)
        result = model(
            input_ids,
            attention_mask=input_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
        )
        result = model(
            input_ids,
            attention_mask=input_mask,
            encoder_hidden_states=encoder_hidden_states,
        )
        result = model(input_ids, attention_mask=input_mask)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))

    def create_and_check_for_causal_lm(
        self,
        config,
        input_ids,
        token_type_ids,
        input_mask,
        sequence_labels,
        token_labels,
        choice_labels,
        encoder_hidden_states,
        encoder_attention_mask,
    ):
        model = TFMistralForCausalLM(config=config)
        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))

    def create_and_check_decoder_model_past_large_inputs(
        self,
        config,
        input_ids,
        token_type_ids,
        input_mask,
        sequence_labels,
        token_labels,
        choice_labels,
        encoder_hidden_states,
        encoder_attention_mask,
    ):
        config.is_decoder = True
        config.add_cross_attention = True
        model = TFMistralForCausalLM(config=config)

        # first forward pass
        outputs = model(
            input_ids,
            attention_mask=input_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=True,
        )
        past_key_values = outputs.past_key_values

        # create hypothetical multiple next tokens and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)

        # append to next input_ids and attention mask
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        next_attention_mask = tf.concat([input_mask, next_mask], axis=-1)

        output_from_no_past = model(
            next_input_ids,
            attention_mask=next_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_hidden_states=True,
        )["hidden_states"][0]
        output_from_past = model(
            next_tokens,
            attention_mask=next_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            output_hidden_states=True,
        )["hidden_states"][0]

        # select random slice
        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])[0])
        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
        output_from_past_slice = output_from_past[:, :, random_slice_idx]

        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])

        # test that outputs are equal for slice
        self.parent.assertTrue(np.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
        return config, inputs_dict


@require_tf
class TFMistralModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    all_model_classes = (
        (TFMistralModel, TFMistralForCausalLM, TFMistralForSequenceClassification) if is_tf_available() else ()
    )
    all_generative_model_classes = (TFMistralForCausalLM,) if is_tf_available() else ()
    pipeline_model_mapping = (
        {
            "feature-extraction": TFMistralModel,
            "text-classification": TFMistralForSequenceClassification,
            "text-generation": TFMistralForCausalLM,
            "zero-shot": TFMistralForSequenceClassification,
        }
        if is_tf_available()
        else {}
    )
    test_onnx = False
    test_pruning = False
    test_missing_keys = False
    test_head_masking = False

    # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
    def is_pipeline_test_to_skip(
        self,
        pipeline_test_case_name,
        config_class,
        model_architecture,
        tokenizer_name,
        image_processor_name,
        feature_extractor_name,
        processor_name,
    ):
        return True

    def setUp(self):
        self.model_tester = TFMistralModelTester(self)
        self.config_tester = ConfigTester(self, config_class=MistralConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_model_various_embeddings(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        for type in ["absolute", "relative_key", "relative_key_query"]:
            config_and_inputs[0].position_embedding_type = type
            self.model_tester.create_and_check_model(*config_and_inputs)

    def test_Mistral_sequence_classification_model(self):
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.num_labels = 3
        input_ids = input_dict["input_ids"]
        attention_mask = tf.not_equal(input_ids, 1)
        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
        model = TFMistralForSequenceClassification(config)
        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))

    def test_Mistral_sequence_classification_model_for_single_label(self):
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.num_labels = 3
        config.problem_type = "single_label_classification"
        input_ids = input_dict["input_ids"]
        attention_mask = tf.not_equal(input_ids, 1)
        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
        model = TFMistralForSequenceClassification(config)
        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))

    def test_Mistral_sequence_classification_model_for_multi_label(self):
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.num_labels = 3
        config.problem_type = "multi_label_classification"
        input_ids = input_dict["input_ids"]
        attention_mask = tf.not_equal(input_ids, 1)
        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
        model = TFMistralForSequenceClassification(config)
        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))

    @unittest.skip("Vocab resizing is not supported")
    def test_save_load_after_resize_token_embeddings(self):
        pass


@require_tf
class TFMistralIntegrationTest(unittest.TestCase):
    @slow
    def test_model_7b_logits(self):
        input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338]
        model = TFMistralForCausalLM.from_pretrained(
            "hf-internal-testing/tiny-random-MistralForCausalLM", from_pt=True
        )
        input_ids = tf.constant([input_ids])
        out = model(input_ids).logits

        # Expected mean on dim = -1
        EXPECTED_MEAN = tf.constant(
            [[-1.281e-04, -2.869e-04, -9.989e-05, -8.995e-05, 2.494e-04, -3.083e-04, -2.672e-04, -1.239e-04]]
        )
        tf.debugging.assert_near(tf.reduce_mean(out, axis=-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)

        # slicing logits[0, 0, 0:30]
        EXPECTED_SLICE = tf.constant([0.1033, 0.1493, -0.0041, -0.0021, -0.1686, 0.0356, 0.0812, 0.2218, -0.1257, 0.1920, 0.0929, 0.1181, 0.0111, 0.0395, -0.0064, 0.1712, -0.0751, 0.0625, -0.2409, 0.1541, -0.1271, -0.2296, -0.0099, -0.0160, 0.0311, -0.0824, -0.1518, 0.0722, 0.0187, 0.0484])  # fmt: skip
        tf.debugging.assert_near(out[0, 0, :30], EXPECTED_SLICE, atol=1e-4, rtol=1e-4)

    @slow
    def test_model_7b_generation(self):
        EXPECTED_TEXT_COMPLETION = """My favourite condiment is Werk a EgyadjustPrintfigiousPDFPHPct guns Ein motor conceti barSequ内 infrastructure millretval"""
        prompt = "My favourite condiment is "
        tokenizer = AutoTokenizer.from_pretrained(
            "hf-internal-testing/tiny-random-MistralForCausalLM", use_fast=False
        )
        model = TFMistralForCausalLM.from_pretrained(
            "hf-internal-testing/tiny-random-MistralForCausalLM", from_pt=True
        )
        input_ids = tokenizer.encode(prompt, return_tensors="tf")

        # greedy generation outputs
        generated_ids = model.generate(input_ids, max_new_tokens=20, temperature=0)
        text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        self.assertEqual(EXPECTED_TEXT_COMPLETION, text)