# Copyright 2025 The BitNet team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch BitNet model."""

import gc
import unittest

import pytest

from transformers import AutoTokenizer, BitNetConfig, is_torch_available
from transformers.testing_utils import (
    backend_empty_cache,
    require_flash_attn,
    require_torch,
    require_torch_gpu,
    slow,
    torch_device,
)

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin


if is_torch_available():
    import torch

    from transformers import (
        BitNetForCausalLM,
        BitNetModel,
    )


class BitNetModelTester:
    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        vocab_size=99,
        hidden_size=64,
        num_hidden_layers=5,
        num_attention_heads=4,
        num_key_value_heads=2,
        intermediate_size=37,
        hidden_act="gelu",
        max_position_embeddings=512,
        initializer_range=0.02,
        pad_token_id=0,
        bos_token_id=1,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.scope = scope

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device))

        config = self.get_config()

        return config, input_ids, input_mask

    def get_config(self):
        return BitNetConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            num_key_value_heads=self.num_key_value_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            max_position_embeddings=self.max_position_embeddings,
            initializer_range=self.initializer_range,
            pad_token_id=self.pad_token_id,
            bos_token_id=self.bos_token_id,
        )

    def create_and_check_model(self, config, input_ids, input_mask):
        model = BitNetModel(config=config)
        model.to(torch_device)
        model.eval()
        # Run a forward pass both with and without the attention mask.
        result = model(input_ids, attention_mask=input_mask)
        result = model(input_ids)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            input_mask,
        ) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
        return config, inputs_dict


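# The test class below runs the shared ModelTesterMixin / GenerationTesterMixin /
# PipelineTesterMixin suites against the tiny, randomly initialized model produced
# by BitNetModelTester above, so no pretrained checkpoint is required.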
@require_torch
class BitNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    all_model_classes = (
        (
            BitNetModel,
            BitNetForCausalLM,
        )
        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = (
        {
            "feature-extraction": BitNetModel,
            "text-generation": BitNetForCausalLM,
        }
        if is_torch_available()
        else {}
    )
    test_headmasking = False
    test_pruning = False
    fx_compatible = False  # Broken by attention refactor cc @Cyrilvallez

    # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
    def is_pipeline_test_to_skip(
        self,
        pipeline_test_case_name,
        config_class,
        model_architecture,
        tokenizer_name,
        image_processor_name,
        feature_extractor_name,
        processor_name,
    ):
        return True

    def setUp(self):
        self.model_tester = BitNetModelTester(self)
        self.config_tester = ConfigTester(self, config_class=BitNetConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_model_various_embeddings(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        for type in ["absolute", "relative_key", "relative_key_query"]:
            config_and_inputs[0].position_embedding_type = type
            self.model_tester.create_and_check_model(*config_and_inputs)

    def test_torch_fx_output_loss(self):
        super().test_torch_fx_output_loss()

    # Ignore copy
    def test_past_key_values_format(self):
        super().test_past_key_values_format()

    @require_flash_attn
    @require_torch_gpu
    @pytest.mark.flash_attn_test
    @slow
    def test_flash_attn_2_inference_equivalence_right_padding(self):
        self.skipTest(reason="BitNet flash attention does not support right padding")


@require_torch
class BitNetIntegrationTest(unittest.TestCase):
    @slow
    def test_model_logits(self):
        input_ids = [128000, 128000, 1502, 25, 2650, 527, 499, 30, 128009, 72803, 25, 220]

        model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
        input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
        with torch.no_grad():
            out = model(input_ids).logits.float().cpu()

        # Expected mean on dim = -1
        EXPECTED_MEAN = torch.tensor(
            [[-1.8665, -1.7681, -1.7043, 3.7446, 2.7730, 4.7133, 0.9768, -3.5018, -12.2812, -8.1477, -10.2571, -8.7610]]
        )
        torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, rtol=1e-2, atol=1e-2)

        # slicing logits[0, 0, 0:30]
        EXPECTED_SLICE = torch.tensor([5.5815, 4.9154, 1.0478, 4.3869, 3.0112, 0.8235, 3.8412, 2.9233, 8.1140, 1.9406, 1.7973, 10.5025, 4.7796, 8.5926, 4.5196, 3.1549, 3.2656, 3.2588, 2.7356, 2.6032, 2.1454, 1.5683, 1.3465, 1.5329, 1.1886, 7.7902, 5.9326, 1.4737, 3.3319, 1.6291])  # fmt: skip
        torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, rtol=1e-4, atol=1e-4)

        del model
        backend_empty_cache(torch_device)
        gc.collect()

    @slow
    def test_model_generation(self):
        EXPECTED_TEXT_COMPLETION = """User: What is your favourite food?Assistant: As an AI, I don't have personal preferences or the ability to eat food. 
However, I"""
        tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
        prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": "What is your favourite food?"}], add_generation_prompt=True, tokenize=False
        )
        model = BitNetForCausalLM.from_pretrained(
            "microsoft/bitnet-b1.58-2B-4T", device_map="auto", torch_dtype=torch.bfloat16
        )
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)

        # greedy generation outputs
        generated_ids = model.generate(input_ids, max_new_tokens=20, do_sample=False)
        text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        self.assertEqual(EXPECTED_TEXT_COMPLETION, text)

        del model
        backend_empty_cache(torch_device)
        gc.collect()
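

# Usage sketch (assumes a transformers development checkout; the path below is the
# conventional location for this file in the repo). Slow integration tests are
# gated behind the RUN_SLOW environment variable used by transformers' test setup:
#   RUN_SLOW=1 pytest tests/models/bitnet/test_modeling_bitnet.py -k BitNetIntegrationTest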