From 48dd89cf55e89129d3374af6a93f6481283d0f98 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Thu, 17 Apr 2025 15:43:04 +0200 Subject: [PATCH] [phi4] update conversion (#37579) * update conversion * update --- .../convert_phi4_multimodal_weights_to_hf.py | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py b/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py index c7cae2ab007..65ced8db26c 100644 --- a/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py +++ b/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py @@ -21,14 +21,19 @@ from peft import LoraConfig from safetensors.torch import load_file, save_file from transformers import ( + AutoProcessor, Phi4MultimodalAudioConfig, Phi4MultimodalConfig, + Phi4MultimodalFeatureExtractor, Phi4MultimodalForCausalLM, + Phi4MultimodalImageProcessorFast, Phi4MultimodalProcessor, Phi4MultimodalVisionConfig, ) +CHAT_TEMPLATE = "{% for message in messages %}{{ '<|' + message['role'] + '|>' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'audio' %}{{ '<|audio|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% endif %}{{ '<|end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}" + # fmt: off STATE_DICT_MAPPING = { r"^model.embed_tokens_extend.audio_embed.encoder.encoders.(\d+).feed_forward_(in|out).net.0.linear": r"model.embed_tokens_extend.audio_embed.encoder.encoders.\1.feed_forward_\2.gate_up_proj", @@ -163,12 +168,36 @@ def 
convert_and_write_model(input_dir: str, output_dir: str): def convert_and_save_processor(input_dir: str, output_dir: str): """Convert the processor.""" - processor = Phi4MultimodalProcessor.from_pretrained(input_dir) - del processor.image_processor.auto_map - del processor.audio_processor.auto_map - processor.chat_template = processor.tokenizer.chat_template - processor.tokenizer.extra_special_tokens = {"image_token": "<|endoftext10|>", "audio_token": "<|endoftext11|>"} - processor.save_pretrained(output_dir) + original_processor = AutoProcessor.from_pretrained(input_dir, trust_remote_code=True) + original_processor.tokenizer.extra_special_tokens = {"image_token": "<|image|>", "audio_token": "<|audio|>"} + converted_processor = Phi4MultimodalProcessor( + tokenizer=original_processor.tokenizer, + image_processor=Phi4MultimodalImageProcessorFast(), + audio_processor=Phi4MultimodalFeatureExtractor(), + chat_template=CHAT_TEMPLATE, + ) + converted_processor.save_pretrained(output_dir) + + # we need to rename a few tokens but tokenizers doesn't allow doing that programmatically + # To avoid confusion and manual renaming, the part below loads and re-saves each JSON file + vocab = json.load(open(f"{output_dir}/vocab.json", "r")) + vocab["<|endoftext11|>"] = "<|audio|>" + vocab["<|endoftext10|>"] = "<|image|>" + json.dump(vocab, open(f"{output_dir}/vocab.json", "w")) + + tokenizer = json.load(open(f"{output_dir}/tokenizer.json", "r")) + tokenizer["added_tokens"][1]["content"] = "<|image|>" + tokenizer["added_tokens"][2]["content"] = "<|audio|>" + tokenizer["model"]["vocab"]["<|audio|>"] = tokenizer["model"]["vocab"]["<|endoftext11|>"] + tokenizer["model"]["vocab"]["<|image|>"] = tokenizer["model"]["vocab"]["<|endoftext10|>"] + del tokenizer["model"]["vocab"]["<|endoftext11|>"] + del tokenizer["model"]["vocab"]["<|endoftext10|>"] + json.dump(tokenizer, open(f"{output_dir}/tokenizer.json", "w")) + + tokenizer_config = json.load(open(f"{output_dir}/tokenizer_config.json", "r"))
+ tokenizer_config["added_tokens_decoder"]["200010"]["content"] = "<|image|>" + tokenizer_config["added_tokens_decoder"]["200011"]["content"] = "<|audio|>" + json.dump(tokenizer_config, open(f"{output_dir}/tokenizer_config.json", "w")) def extract_adapters_data(input_dir: str, output_dir: str):