From 48dd89cf55e89129d3374af6a93f6481283d0f98 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Thu, 17 Apr 2025 15:43:04 +0200 Subject: [PATCH] [phi4] update conversion (#37579) * update conversion * update --- .../convert_phi4_multimodal_weights_to_hf.py | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py b/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py index c7cae2ab007..65ced8db26c 100644 --- a/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py +++ b/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py @@ -21,14 +21,19 @@ from peft import LoraConfig from safetensors.torch import load_file, save_file from transformers import ( + AutoProcessor, Phi4MultimodalAudioConfig, Phi4MultimodalConfig, + Phi4MultimodalFeatureExtractor, Phi4MultimodalForCausalLM, + Phi4MultimodalImageProcessorFast, Phi4MultimodalProcessor, Phi4MultimodalVisionConfig, ) +CHAT_TEMPLATE = "{% for message in messages %}{{ '<|' + message['role'] + '|>' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'audio' %}{{ '<|audio|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% endif %}{{ '<|end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}" + # fmt: off STATE_DICT_MAPPING = { r"^model.embed_tokens_extend.audio_embed.encoder.encoders.(\d+).feed_forward_(in|out).net.0.linear": r"model.embed_tokens_extend.audio_embed.encoder.encoders.\1.feed_forward_\2.gate_up_proj", @@ -163,12 +168,36 @@ def 
convert_and_write_model(input_dir: str, output_dir: str): def convert_and_save_processor(input_dir: str, output_dir: str): """Convert the processor.""" - processor = Phi4MultimodalProcessor.from_pretrained(input_dir) - del processor.image_processor.auto_map - del processor.audio_processor.auto_map - processor.chat_template = processor.tokenizer.chat_template - processor.tokenizer.extra_special_tokens = {"image_token": "<|endoftext10|>", "audio_token": "<|endoftext11|>"} - processor.save_pretrained(output_dir) + original_processor = AutoProcessor.from_pretrained(input_dir, trust_remote_code=True) + original_processor.tokenizer.extra_special_tokens = {"image_token": "<|image|>", "audio_token": "<|audio|>"} + converted_processor = Phi4MultimodalProcessor( + tokenizer=original_processor.tokenizer, + image_processor=Phi4MultimodalImageProcessorFast(), + audio_processor=Phi4MultimodalFeatureExtractor(), + chat_template=CHAT_TEMPLATE, + ) + converted_processor.save_pretrained(output_dir) + + # we need to rename a few tokens but tokenizers doesn't allow doing that programmatically + # To avoid confusion and manual renaming, the part below loads and re-saves each JSON file + vocab = json.load(open(f"{output_dir}/vocab.json", "r")) + vocab["<|endoftext11|>"] = "<|audio|>" + vocab["<|endoftext10|>"] = "<|image|>" + json.dump(vocab, open(f"{output_dir}/vocab.json", "w")) + + tokenizer = json.load(open(f"{output_dir}/tokenizer.json", "r")) + tokenizer["added_tokens"][1]["content"] = "<|image|>" + tokenizer["added_tokens"][2]["content"] = "<|audio|>" + tokenizer["model"]["vocab"]["<|audio|>"] = tokenizer["model"]["vocab"]["<|endoftext11|>"] + tokenizer["model"]["vocab"]["<|image|>"] = tokenizer["model"]["vocab"]["<|endoftext10|>"] + del tokenizer["model"]["vocab"]["<|endoftext11|>"] + del tokenizer["model"]["vocab"]["<|endoftext10|>"] + json.dump(tokenizer, open(f"{output_dir}/tokenizer.json", "w")) + + tokenizer_config = json.load(open(f"{output_dir}/tokenizer_config.json", "r"))
+ tokenizer_config["added_tokens_decoder"]["200010"]["content"] = "<|image|>" + tokenizer_config["added_tokens_decoder"]["200011"]["content"] = "<|audio|>" + json.dump(tokenizer_config, open(f"{output_dir}/tokenizer_config.json", "w")) def extract_adapters_data(input_dir: str, output_dir: str):