transformers/tests/models/clvp/test_modeling_clvp.py

# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Clvp model."""

import tempfile
import unittest

import datasets
import numpy as np

from transformers import ClvpConfig, ClvpDecoderConfig, ClvpEncoderConfig
from transformers.testing_utils import (
    cleanup,
    require_torch,
    slow,
    torch_device,
)
from transformers.utils import is_torch_available

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
    ModelTesterMixin,
    _config_zero_init,
    ids_tensor,
    random_attention_mask,
)
from ...test_pipeline_mixin import PipelineTesterMixin


if is_torch_available():
    import torch

    from transformers import ClvpEncoder, ClvpForCausalLM, ClvpModel, ClvpModelForConditionalGeneration

from transformers import ClvpFeatureExtractor, ClvpTokenizer


class ClvpEncoderTester:
    def __init__(
        self,
        parent,
        batch_size=2,
        seq_length=7,
        is_training=False,
        use_input_mask=True,
        use_labels=True,
        vocab_size=50,
        hidden_size=128,
        projection_dim=16,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=32,
        dropout=0.1,
        attention_dropout=0.1,
        initializer_range=0.02,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.scope = scope
        self.bos_token_id = vocab_size - 1
        self.eos_token_id = vocab_size - 1

    def get_config(self):
        encoder_config = ClvpEncoderConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            projection_dim=self.projection_dim,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            dropout=self.dropout,
            attention_dropout=self.attention_dropout,
            initializer_range=self.initializer_range,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
        )

        return encoder_config

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])

        if input_mask is not None:
            batch_size, seq_length = input_mask.shape
            rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
            for batch_idx, start_index in enumerate(rnd_start_indices):
                input_mask[batch_idx, :start_index] = 1
                input_mask[batch_idx, start_index:] = 0

        encoder_config = self.get_config()

        return encoder_config, input_ids, input_mask

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        speech_config, input_ids, input_mask = config_and_inputs
        inputs_dict = {"input_ids": input_ids.to(torch_device), "attention_mask": input_mask.to(torch_device)}
        return speech_config, inputs_dict

    def create_and_check_model(self, speech_config, input_ids, input_mask):
        text_config = ClvpEncoderConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            projection_dim=self.projection_dim,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            dropout=self.dropout,
            attention_dropout=self.attention_dropout,
            initializer_range=self.initializer_range,
        )
        text_encoder_model = ClvpEncoder(config=text_config)
        text_encoder_model.to(torch_device)
        text_encoder_model.eval()
        with torch.no_grad():
            result = text_encoder_model(input_ids, attention_mask=input_mask)
            result = text_encoder_model(input_ids)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
        self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim))

        # now check with speech config
        speech_encoder_model = ClvpEncoder(config=speech_config)
        speech_encoder_model.to(torch_device)
        speech_encoder_model.eval()
        with torch.no_grad():
            result = speech_encoder_model(input_ids, attention_mask=input_mask)
            result = speech_encoder_model(input_ids)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
        self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim))


@require_torch
class ClvpEncoderTest(ModelTesterMixin, unittest.TestCase):
    all_model_classes = (ClvpEncoder,) if is_torch_available() else ()
    test_pruning = False
    test_head_masking = False
    test_torchscript = False

    def setUp(self):
        self.model_tester = ClvpEncoderTester(self)
        self.encoder_config_tester = ConfigTester(self, config_class=ClvpEncoderConfig, hidden_size=32)

    def tearDown(self):
        super().tearDown()
        # clean-up as much as possible GPU memory occupied by PyTorch
        cleanup(torch_device)

    def test_config(self):
        self.encoder_config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    @unittest.skip(reason="ClvpEncoder does not output loss")
    def test_training(self):
        pass

    @unittest.skip(reason="ClvpEncoder does not output loss")
    def test_training_gradient_checkpointing(self):
        pass


class ClvpDecoderTester:
    def __init__(
        self,
        parent,
        batch_size=2,
        seq_length=3,
        is_training=False,
        vocab_size=300,
        max_position_embeddings=256,
        max_text_tokens=256,
        use_input_mask=True,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=2,
        bos_token_id=97,
        eos_token_id=98,
        relative_attention_num_buckets=4,
        relative_attention_max_distance=16,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.max_text_tokens = max_text_tokens
        self.use_input_mask = use_input_mask
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.relative_attention_max_distance = relative_attention_max_distance

    def get_config(self):
        decoder_config = ClvpDecoderConfig(
            vocab_size=self.vocab_size,
            max_position_embeddings=self.max_position_embeddings,
            max_text_tokens=self.max_text_tokens,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            relative_attention_num_buckets=self.relative_attention_num_buckets,
            relative_attention_max_distance=self.relative_attention_max_distance,
        )

        return decoder_config

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])

        if input_mask is not None:
            batch_size, seq_length = input_mask.shape
            rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
            for batch_idx, start_index in enumerate(rnd_start_indices):
                input_mask[batch_idx, :start_index] = 1
                input_mask[batch_idx, start_index:] = 0

        decoder_config = self.get_config()

        return decoder_config, input_ids, input_mask

    def create_and_check_model(self, config, input_ids, attention_mask):
        model = ClvpForCausalLM(config).to(torch_device).eval()
        with torch.no_grad():
            result = model(input_ids=input_ids, attention_mask=attention_mask)

        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.vocab_size))

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        config, input_ids, attention_mask = config_and_inputs
        inputs_dict = {
            "input_ids": input_ids.to(torch_device),
            "attention_mask": attention_mask.to(torch_device),
        }
        return config, inputs_dict


@require_torch
class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    all_model_classes = (ClvpModel, ClvpForCausalLM) if is_torch_available() else ()
    pipeline_model_mapping = {"feature-extraction": ClvpModelForConditionalGeneration} if is_torch_available() else {}

    test_pruning = False

    def setUp(self):
        self.model_tester = ClvpDecoderTester(self)
        self.decoder_config_tester = ConfigTester(self, config_class=ClvpDecoderConfig, hidden_size=32)

    def tearDown(self):
        super().tearDown()
        # clean-up as much as possible GPU memory occupied by PyTorch
        cleanup(torch_device)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        if return_labels and model_class == ClvpForCausalLM:
            inputs_dict["labels"] = torch.zeros(
                [self.model_tester.batch_size, self.model_tester.seq_length], device=torch_device
            ).long()

        return inputs_dict

    def test_training(self):
        # we will only test the ClvpForCausalLM since it outputs loss
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        model = ClvpForCausalLM(config)
        model.to(torch_device)
        model.train()
        inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True)
        loss = model(**inputs).loss
        loss.backward()

    def test_training_gradient_checkpointing(self):
        # we will only test the ClvpForCausalLM since it outputs loss
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.use_cache = False
        config.return_dict = True

        model = ClvpForCausalLM(config)
        model.to(torch_device)
        model.gradient_checkpointing_enable()
        model.train()
        inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True)

        loss = model(**inputs).loss
        loss.backward()

    @unittest.skip(reason="Clvp `prepare_inputs_for_generation` function doesn't have cache position.")
    def test_generate_continue_from_inputs_embeds(self):
        pass


class ClvpModelForConditionalGenerationTester:
    def __init__(self, parent, is_training=False):
        self.parent = parent
        self.clvp_encoder_tester = ClvpEncoderTester(parent)
        self.is_training = is_training
        self.batch_size = self.clvp_encoder_tester.batch_size  # need bs for batching_equivalence test

    def get_config(self):
        decoder_config = ClvpDecoderConfig(
            vocab_size=50,
            max_position_embeddings=30,
            max_text_tokens=30,
            hidden_size=128,
            num_hidden_layers=1,
            num_attention_heads=2,
            bos_token_id=97,
            eos_token_id=98,
            relative_attention_num_buckets=4,
            relative_attention_max_distance=16,
        )
        text_config = self.clvp_encoder_tester.get_config()
        speech_config = self.clvp_encoder_tester.get_config()
        speech_config.vocab_size = 300

        return ClvpConfig.from_sub_model_configs(
            text_config,
            speech_config,
            decoder_config,
            projection_dim=16,
        )

    def prepare_config_and_inputs(self):
        _, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs()

        ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
        _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()

        feature_extractor = ClvpFeatureExtractor()
        input_features = feature_extractor(raw_speech=audio, sampling_rate=sr, return_tensors="pt")[
            "input_features"
        ].to(torch_device)

        config = self.get_config()

        return config, input_ids, attention_mask, input_features

    def create_and_check_model(self, config, input_ids, attention_mask, input_features):
        model = ClvpModelForConditionalGeneration(config).to(torch_device).eval()
        with torch.no_grad():
            result = model(input_ids=input_ids, input_features=input_features, attention_mask=attention_mask)

        self.parent.assertEqual(result.logits_per_speech.shape, (2, self.clvp_encoder_tester.batch_size))
        self.parent.assertEqual(result.logits_per_text.shape, (self.clvp_encoder_tester.batch_size, 2))

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        config, input_ids, attention_mask, input_features = config_and_inputs
        inputs_dict = {
            "input_ids": input_ids.to(torch_device),
            "attention_mask": attention_mask.to(torch_device),
            "input_features": input_features.to(torch_device),
            "return_loss": False,
        }
        return config, inputs_dict


@require_torch
class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
    all_model_classes = (ClvpModelForConditionalGeneration,) if is_torch_available() else ()
    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
    all_generative_model_classes = ()

    test_head_masking = False
    test_pruning = False
    test_resize_embeddings = False
    test_attention_outputs = False
    test_torchscript = False

    def setUp(self):
        self.model_tester = ClvpModelForConditionalGenerationTester(self)
        common_properties = ["projection_dim", "logit_scale_init_value"]
        self.clvp_config_tester = ConfigTester(
            self, config_class=ClvpConfig, has_text_modality=False, common_properties=common_properties, hidden_size=32
        )

    def test_config(self):
        self.clvp_config_tester.run_common_tests()

    def tearDown(self):
        super().tearDown()
        # clean-up as much as possible GPU memory occupied by PyTorch
        cleanup(torch_device)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            # check for decoder model, text encoder model and speech encoder model hidden states
            decoder_hidden_states = outputs.decoder_hidden_states
            text_encoder_hidden_states = outputs.text_encoder_hidden_states
            speech_encoder_hidden_states = outputs.speech_encoder_hidden_states

            # check length of the hidden states
            expected_decoder_num_layers = config.decoder_config.num_hidden_layers + 1
            self.assertEqual(len(decoder_hidden_states), expected_decoder_num_layers)

            expected_speech_encoder_num_layers = config.text_config.num_hidden_layers + 1
            self.assertEqual(len(text_encoder_hidden_states), expected_speech_encoder_num_layers)

            expected_text_encoder_num_layers = config.speech_config.num_hidden_layers + 1
            self.assertEqual(len(speech_encoder_hidden_states), expected_text_encoder_num_layers)

            # check shapes of each hidden state

            # for the decoder model we will only test the dimension because the ClvpConditioningEncoder could increase
            # the sequence lengths.
            self.assertEqual(decoder_hidden_states[0].shape[-1], config.decoder_config.hidden_size)

            # the testing for text encoder stays standard because we just pass the text tokens here.
            self.assertListEqual(
                list(text_encoder_hidden_states[0].shape[-2:]),
                [self.model_tester.clvp_encoder_tester.seq_length, config.text_config.hidden_size],
            )

            # for the decoder model we will only test the dimension because the fix_decoder_outputs method could increase
            # the sequence lengths by adding `decoder_fixing_codes` tokens at the end.
            self.assertEqual(speech_encoder_hidden_states[0].shape[-1], config.speech_config.hidden_size)

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    @unittest.skip(reason="Retain_grad is tested in individual model tests")
    def test_retain_grad_hidden_states_attentions(self):
        pass

    @unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings")
    def test_inputs_embeds(self):
        pass

    @unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings")
    def test_model_get_set_embeddings(self):
        pass

    # override as the `logit_scale` parameter initialization is different for Clvp
    def test_initialization(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        configs_no_init = _config_zero_init(config)
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                if param.requires_grad:
                    # check if `logit_scale` is initialized as per the original implementation
                    if name == "logit_scale":
                        expected_value = np.log(1 / 0.07)
                        returned_value = param.data.item()

                        self.assertAlmostEqual(
                            returned_value,
                            expected_value,
                            delta=1e-3,
                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                        )
                    else:
                        expected_range = [0.0, 1.0]
                        returned_range = ((param.data.mean() * 1e9).round() / 1e9).item()

                        self.assertIn(
                            returned_range,
                            expected_range,
                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                        )

    def test_load_speech_text_decoder_config(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        # Save ClvpConfig and check if we can load ClvpEncoderConfig from it
        with tempfile.TemporaryDirectory() as tmp_dir_name:
            config.save_pretrained(tmp_dir_name)
            encoder_config = ClvpEncoderConfig.from_pretrained(tmp_dir_name)
            self.assertDictEqual(config.text_config.to_dict(), encoder_config.to_dict())

        # Save ClvpConfig and check if we can load ClvpDecoderConfig from it
        with tempfile.TemporaryDirectory() as tmp_dir_name:
            config.save_pretrained(tmp_dir_name)
            decoder_config = ClvpDecoderConfig.from_pretrained(tmp_dir_name)
            self.assertDictEqual(config.decoder_config.to_dict(), decoder_config.to_dict())

    @slow
    def test_model_from_pretrained(self):
        model_name = "susnato/clvp_dev"
        model = ClvpModelForConditionalGeneration.from_pretrained(model_name)
        self.assertIsNotNone(model)


# Since Clvp has a lot of different models connected with each other it's better to test each of them individually along
# with a test_full_model_integration. If the model breaks in future, it could be of a great help to identify the broken part.


@slow
@require_torch
class ClvpIntegrationTest(unittest.TestCase):
    def setUp(self):
        self.text = "This is an example text."
        ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
        _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()

        self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev").to(torch_device)
        self.model.eval()
        tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev")
        feature_extractor = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev")

        tokenizer_output = tokenizer(self.text, return_tensors="pt")
        self.text_tokens = tokenizer_output["input_ids"].to(torch_device)
        self.input_features = feature_extractor(
            raw_speech=self.speech_samples, sampling_rate=self.sr, return_tensors="pt"
        )["input_features"].to(torch_device)

    def tearDown(self):
        super().tearDown()
        # clean-up as much as possible GPU memory occupied by PyTorch
        cleanup(torch_device, gc_collect=True)

    def test_conditional_encoder(self):
        with torch.no_grad():
            conditioning_encoder_outputs = self.model.conditioning_encoder(
                input_features=self.input_features, input_ids=self.text_tokens
            ).to("cpu")

        self.assertEqual(
            conditioning_encoder_outputs.shape,
            torch.Size((self.input_features.shape[0], 18, self.model.config.decoder_config.hidden_size)),
        )

        EXPECTED_OUTPUTS = torch.tensor(
            [[-0.8582, 0.5228, 1.9944], [-0.0465, -1.1017, -0.0093], [-0.0466, -0.6030, -0.1280]]
        )

        torch.testing.assert_close(conditioning_encoder_outputs[0, :3, :3], EXPECTED_OUTPUTS, rtol=1e-4, atol=1e-4)

    def test_decoder_model_generate(self):
        autoregressive_model_output = self.model.speech_decoder_model.generate(input_ids=self.text_tokens).cpu()

        EXPECTED_OUTPUTS = torch.tensor([[147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 9, 8193]])

        torch.testing.assert_close(autoregressive_model_output, EXPECTED_OUTPUTS)

    def test_text_and_speech_encoder_models(self):
        # check for text embeds
        text_embeds = self.model.text_encoder_model(input_ids=self.text_tokens, return_dict=True)[0].cpu()

        # fmt: off
        EXPECTED_TEXT_EMBEDS = torch.tensor([1.4798, -2.0005, 2.3902, -0.5042, 1.6401, -2.4135, -1.4800, 3.0118, -2.4422, 1.3266, 2.2339, 1.4761, -4.8983, -1.3592, 6.0251, 6.7364, 2.2576, 3.7229, -10.0436, 4.6676])
        # fmt: on

        torch.testing.assert_close(text_embeds[0, :20], EXPECTED_TEXT_EMBEDS, rtol=1e-4, atol=1e-4)

        # check for speech embeds
        speech_embeds = self.model.speech_encoder_model(input_ids=self.text_tokens, return_dict=True)[0].cpu()

        # fmt: off
        EXPECTED_SPEECH_EMBEDS = torch.tensor([3.1202, -3.1183, -1.4264, -6.1339, 1.8885, -0.1983, 0.9461, -1.7414, 0.3320, -3.8400, -1.5715, 1.5096, -1.7576, 0.2387, 4.9758, 5.8450, -6.2534, 2.8587, -5.5816, 4.7821])
        # fmt: on

        torch.testing.assert_close(speech_embeds[0, :20], EXPECTED_SPEECH_EMBEDS, rtol=1e-4, atol=1e-4)

    def test_full_model_integration(self):
        full_model_output = self.model.generate(
            input_ids=self.text_tokens,
            input_features=self.input_features,
            do_sample=False,
            num_beams=4,
            num_return_sequences=4,
            max_new_tokens=10,
        )

        EXPECTED_SPEECH_IDS = torch.tensor([[1953, 1080, 612], [1953, 612, 493], [1953, 612, 716]])
        EXPECTED_SIMILARITY_SCORES = torch.tensor([[14.7660, 14.4569, 13.6472, 13.5683]])

        torch.testing.assert_close(full_model_output.speech_ids.cpu()[-3:, -3:], EXPECTED_SPEECH_IDS)
        torch.testing.assert_close(full_model_output.logits_per_text.cpu(), EXPECTED_SIMILARITY_SCORES)