[phi4] update conversion (#37579)

* update conversion

* update
Author: Raushan Turganbay
Date:   2025-04-17 15:43:04 +02:00
Commit: 48dd89cf55 (parent 58e5e976e0), committed by GitHub


@@ -21,14 +21,19 @@ from peft import LoraConfig
 from safetensors.torch import load_file, save_file
 from transformers import (
+    AutoProcessor,
     Phi4MultimodalAudioConfig,
     Phi4MultimodalConfig,
+    Phi4MultimodalFeatureExtractor,
     Phi4MultimodalForCausalLM,
+    Phi4MultimodalImageProcessorFast,
     Phi4MultimodalProcessor,
     Phi4MultimodalVisionConfig,
 )
 
+CHAT_TEMPLATE = "{% for message in messages %}{{ '<|' + message['role'] + '|>' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'audio' %}{{ '<|audio|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% endif %}{{ '<|end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}"
+
 # fmt: off
 STATE_DICT_MAPPING = {
     r"^model.embed_tokens_extend.audio_embed.encoder.encoders.(\d+).feed_forward_(in|out).net.0.linear": r"model.embed_tokens_extend.audio_embed.encoder.encoders.\1.feed_forward_\2.gate_up_proj",
@@ -163,12 +168,36 @@ def convert_and_write_model(input_dir: str, output_dir: str):
 def convert_and_save_processor(input_dir: str, output_dir: str):
     """Convert the processor."""
-    processor = Phi4MultimodalProcessor.from_pretrained(input_dir)
-    del processor.image_processor.auto_map
-    del processor.audio_processor.auto_map
-    processor.chat_template = processor.tokenizer.chat_template
-    processor.tokenizer.extra_special_tokens = {"image_token": "<|endoftext10|>", "audio_token": "<|endoftext11|>"}
-    processor.save_pretrained(output_dir)
+    original_processor = AutoProcessor.from_pretrained(input_dir, trust_remote_code=True)
+    original_processor.tokenizer.extra_special_tokens = {"image_token": "<|image|>", "audio_token": "<|audio|>"}
+    converted_processor = Phi4MultimodalProcessor(
+        tokenizer=original_processor.tokenizer,
+        image_processor=Phi4MultimodalImageProcessorFast(),
+        audio_processor=Phi4MultimodalFeatureExtractor(),
+        chat_template=CHAT_TEMPLATE,
+    )
+    converted_processor.save_pretrained(output_dir)
+
+    # We need to rename a few tokens, but tokenizers doesn't allow doing that programmatically.
+    # To avoid confusion and manual renaming, the code below loads and re-saves each json file.
+    vocab = json.load(open(f"{output_dir}/vocab.json", "r"))
+    vocab["<|endoftext11|>"] = "<|audio|>"
+    vocab["<|endoftext10|>"] = "<|image|>"
+    json.dump(vocab, open(f"{output_dir}/vocab.json", "w"))
+
+    tokenizer = json.load(open(f"{output_dir}/tokenizer.json", "r"))
+    tokenizer["added_tokens"][1]["content"] = "<|image|>"
+    tokenizer["added_tokens"][2]["content"] = "<|audio|>"
+    tokenizer["model"]["vocab"]["<|audio|>"] = tokenizer["model"]["vocab"]["<|endoftext11|>"]
+    tokenizer["model"]["vocab"]["<|image|>"] = tokenizer["model"]["vocab"]["<|endoftext10|>"]
+    del tokenizer["model"]["vocab"]["<|endoftext11|>"]
+    del tokenizer["model"]["vocab"]["<|endoftext10|>"]
+    json.dump(tokenizer, open(f"{output_dir}/tokenizer.json", "w"))
+
+    tokenizer_config = json.load(open(f"{output_dir}/tokenizer_config.json", "r"))
+    tokenizer_config["added_tokens_decoder"]["200010"]["content"] = "<|image|>"
+    tokenizer_config["added_tokens_decoder"]["200011"]["content"] = "<|audio|>"
+    json.dump(tokenizer_config, open(f"{output_dir}/tokenizer_config.json", "w"))
 
 
 def extract_adapters_data(input_dir: str, output_dir: str):
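After the json round-trip above, the renamed special tokens should resolve to the same ids that the tokenizer_config edits target (200010 for the image token, 200011 for the audio token). A quick sanity check one could run on the converted output, not part of the commit (`output_dir` is a placeholder path):

from transformers import AutoTokenizer

output_dir = "path/to/converted-phi4-multimodal"  # placeholder: wherever convert_and_save_processor wrote its files
tok = AutoTokenizer.from_pretrained(output_dir)

# The renamed tokens should map to the ids edited in added_tokens_decoder above.
assert tok.convert_tokens_to_ids("<|image|>") == 200010
assert tok.convert_tokens_to_ids("<|audio|>") == 200011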