diff --git a/.gitignore b/.gitignore index 337f2ef2c73..cdf189505dc 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,6 @@ tags # ruff .ruff_cache + +# modular conversion +*.modular_backup diff --git a/tests/repo_utils/modular/test_conversion_order.py b/tests/repo_utils/modular/test_conversion_order.py index 265873c604c..65d31c2203c 100644 --- a/tests/repo_utils/modular/test_conversion_order.py +++ b/tests/repo_utils/modular/test_conversion_order.py @@ -38,20 +38,29 @@ FILES_TO_PARSE = [ os.path.join(MODEL_ROOT, "mistral", "modular_mistral.py"), os.path.join(MODEL_ROOT, "phi3", "modular_phi3.py"), os.path.join(MODEL_ROOT, "cohere", "modular_cohere.py"), + os.path.join(MODEL_ROOT, "glm4", "modular_glm4.py"), ] -def appear_after(model1: str, model2: str, priority_list: list[str]) -> bool: +def appear_after(model1: str, model2: str, priority_list: list[list[str]]) -> bool: """Return True if `model1` appear after `model2` in `priority_list`.""" - return priority_list.index(model1) > priority_list.index(model2) + model1_index, model2_index = None, None + for i, level in enumerate(priority_list): + if model1 in level: + model1_index = i + if model2 in level: + model2_index = i + if model1_index is None or model2_index is None: + raise ValueError(f"Model {model1} or {model2} not found in {priority_list}") + return model1_index > model2_index class ConversionOrderTest(unittest.TestCase): def test_conversion_order(self): # Find the order priority_list, _ = create_dependency_mapping.find_priority_list(FILES_TO_PARSE) - # Extract just the model names - model_priority_list = [file.rsplit("modular_")[-1].replace(".py", "") for file in priority_list] + # Extract just the model names (list of lists) + model_priority_list = [[file.split("/")[-2] for file in level] for level in priority_list] # These are based on what the current library order should be (as of 09/01/2025) self.assertTrue(appear_after("mixtral", "mistral", model_priority_list)) @@ -62,3 +71,4 @@ class 
ConversionOrderTest(unittest.TestCase): self.assertTrue(appear_after("cohere2", "gemma2", model_priority_list)) self.assertTrue(appear_after("cohere2", "cohere", model_priority_list)) self.assertTrue(appear_after("phi3", "mistral", model_priority_list)) + self.assertTrue(appear_after("glm4", "glm", model_priority_list)) diff --git a/utils/check_modular_conversion.py b/utils/check_modular_conversion.py index fa233d0cf2a..15dcf418d67 100644 --- a/utils/check_modular_conversion.py +++ b/utils/check_modular_conversion.py @@ -2,7 +2,11 @@ import argparse import difflib import glob import logging +import multiprocessing +import os +import shutil import subprocess +from functools import partial from io import StringIO from create_dependency_mapping import find_priority_list @@ -17,8 +21,15 @@ logging.basicConfig() logging.getLogger().setLevel(logging.ERROR) console = Console() +BACKUP_EXT = ".modular_backup" -def process_file(modular_file_path, generated_modeling_content, file_type="modeling_", fix_and_overwrite=False): + +def process_file( + modular_file_path, + generated_modeling_content, + file_type="modeling_", + show_diff=True, +): file_name_prefix = file_type.split("*")[0] file_name_suffix = file_type.split("*")[-1] if "*" in file_type else "" file_path = modular_file_path.replace("modular_", f"{file_name_prefix}_").replace(".py", f"{file_name_suffix}.py") @@ -38,11 +49,14 @@ def process_file(modular_file_path, generated_modeling_content, file_type="model diff_list = list(diff) # Check for differences if diff_list: - if fix_and_overwrite: - with open(file_path, "w", encoding="utf-8", newline="\n") as modeling_file: - modeling_file.write(generated_modeling_content[file_type][0]) - console.print(f"[bold blue]Overwritten {file_path} with the generated content.[/bold blue]") - else: + # first save the copy of the original file, to be able to restore it later + if os.path.exists(file_path): + shutil.copy(file_path, file_path + BACKUP_EXT) + # we always save the generated 
content, to be able to update dependent files + with open(file_path, "w", encoding="utf-8", newline="\n") as modeling_file: + modeling_file.write(generated_modeling_content[file_type][0]) + console.print(f"[bold blue]Overwritten {file_path} with the generated content.[/bold blue]") + if show_diff: console.print(f"\n[bold red]Differences found between the generated code and {file_path}:[/bold red]\n") diff_text = "\n".join(diff_list) syntax = Syntax(diff_text, "diff", theme="ansi_dark", line_numbers=True) @@ -53,12 +67,12 @@ def process_file(modular_file_path, generated_modeling_content, file_type="model return 0 -def compare_files(modular_file_path, fix_and_overwrite=False): +def compare_files(modular_file_path, show_diff=True): # Generate the expected modeling content generated_modeling_content = convert_modular_file(modular_file_path) diff = 0 for file_type in generated_modeling_content.keys(): - diff += process_file(modular_file_path, generated_modeling_content, file_type, fix_and_overwrite) + diff += process_file(modular_file_path, generated_modeling_content, file_type, show_diff) return diff @@ -120,16 +134,20 @@ if __name__ == "__main__": parser.add_argument( "--fix_and_overwrite", action="store_true", help="Overwrite the modeling_xxx.py file if differences are found." ) + parser.add_argument("--check_all", action="store_true", help="Check all files, not just the ones in the diff.") parser.add_argument( "--num_workers", - default=1, + default=-1, type=int, - help="The number of workers to run. No effect if `fix_and_overwrite` is specified.", + help="The number of workers to run. 
Default is -1, which means the number of CPU cores.", ) args = parser.parse_args() if args.files == ["all"]: args.files = glob.glob("src/transformers/models/**/modular_*.py", recursive=True) + if args.num_workers == -1: + args.num_workers = multiprocessing.cpu_count() + # Assuming there is a topological sort on the dependency mapping: if the file being checked and its dependencies # are not in the diff, then there it is guaranteed to have no differences. If no models are in the diff, then this # script will do nothing. @@ -141,44 +159,71 @@ if __name__ == "__main__": models_in_diff = {file_path.split("/")[-2] for file_path in args.files} else: models_in_diff = get_models_in_diff() - if not models_in_diff: + if not models_in_diff and not args.check_all: console.print( "[bold green]No models files or model tests in the diff, skipping modular checks[/bold green]" ) exit(0) skipped_models = set() - non_matching_files = 0 + non_matching_files = [] ordered_files, dependencies = find_priority_list(args.files) - if args.fix_and_overwrite or args.num_workers == 1: - for modular_file_path in ordered_files: - is_guaranteed_no_diff = guaranteed_no_diff(modular_file_path, dependencies, models_in_diff) - if is_guaranteed_no_diff: - model_name = modular_file_path.rsplit("modular_", 1)[1].replace(".py", "") - skipped_models.add(model_name) + flat_ordered_files = [item for sublist in ordered_files for item in sublist] + + # ordered_files is a *sorted* list of lists of filepaths + # - files from the first list do NOT depend on other files + # - files in the second list depend on files from the first list + # - files in the third list depend on files from the second and (optionally) the first list + # - ... 
and so on + # files (models) within the same list are *independent* of each other; + # we start applying modular conversion to each list in parallel, starting from the first list + + console.print(f"[bold yellow]Number of dependency levels: {len(ordered_files)}[/bold yellow]") + console.print(f"[bold yellow]Files per level: {tuple([len(x) for x in ordered_files])}[/bold yellow]") + + try: + for dependency_level_files in ordered_files: + # Filter files guaranteed no diff + files_to_check = [] + for file_path in dependency_level_files: + if not args.check_all and guaranteed_no_diff(file_path, dependencies, models_in_diff): + skipped_models.add(file_path.split("/")[-2]) # save model folder name + else: + files_to_check.append(file_path) + + if not files_to_check: continue - non_matching_files += compare_files(modular_file_path, args.fix_and_overwrite) - if current_branch != "main": - models_in_diff = get_models_in_diff() # When overwriting, the diff changes - else: - new_ordered_files = [] - for modular_file_path in ordered_files: - is_guaranteed_no_diff = guaranteed_no_diff(modular_file_path, dependencies, models_in_diff) - if is_guaranteed_no_diff: - model_name = modular_file_path.rsplit("modular_", 1)[1].replace(".py", "") - skipped_models.add(model_name) - else: - new_ordered_files.append(modular_file_path) - import multiprocessing + # Process files with diff + num_workers = min(args.num_workers, len(files_to_check)) + with multiprocessing.Pool(num_workers) as p: + is_changed_flags = p.map( + partial(compare_files, show_diff=not args.fix_and_overwrite), + files_to_check, + ) - with multiprocessing.Pool(args.num_workers) as p: - outputs = p.map(compare_files, new_ordered_files) - for output in outputs: - non_matching_files += output + # Collect changed files and their original paths + for is_changed, file_path in zip(is_changed_flags, files_to_check): + if is_changed: + non_matching_files.append(file_path) + + # Update changed models, after each round of 
conversions + # (save model folder name) + models_in_diff.add(file_path.split("/")[-2]) + + finally: + # Restore files overwritten by modular conversion (if needed) + backup_files = glob.glob("**/*" + BACKUP_EXT, recursive=True) + for backup_file_path in backup_files: + overwritten_path = backup_file_path.replace(BACKUP_EXT, "") + if not args.fix_and_overwrite and os.path.exists(overwritten_path): + shutil.copy(backup_file_path, overwritten_path) + os.remove(backup_file_path) if non_matching_files and not args.fix_and_overwrite: - raise ValueError("Some diff and their modeling code did not match.") + diff_models = set(file_path.split("/")[-2] for file_path in non_matching_files) # noqa + models_str = "\n - " + "\n - ".join(sorted(diff_models)) + raise ValueError(f"Some diff and their modeling code did not match. Models in diff:{models_str}") if skipped_models: console.print( diff --git a/utils/create_dependency_mapping.py b/utils/create_dependency_mapping.py index 782617e27c2..7590fda98c2 100644 --- a/utils/create_dependency_mapping.py +++ b/utils/create_dependency_mapping.py @@ -3,7 +3,19 @@ from collections import defaultdict # Function to perform topological sorting -def topological_sort(dependencies: dict): +def topological_sort(dependencies: dict) -> list[list[str]]: + """Given the dependency graph, construct a sorted list of lists of modular files + + For example, returned list of lists might be: + [ + ["../modular_llama.py", "../modular_gemma.py"], # level 0 + ["../modular_llama4.py", "../modular_gemma2.py"], # level 1 + ["../modular_glm4.py"], # level 2 + ] + which means llama and gemma do not depend on any other modular models, while llama4 and gemma2 + depend on the models in the first list, and glm4 depends on the models in the second and (optionally) in the first list. 
+ """ + # Nodes are the name of the models to convert (we only add those to the graph) nodes = {node.rsplit("modular_", 1)[1].replace(".py", "") for node in dependencies.keys()} # This will be a graph from models to convert, to models to convert that should be converted before (as they are a dependency) @@ -20,12 +32,12 @@ def topological_sort(dependencies: dict): while len(graph) > 0: # Find the nodes with 0 out-degree leaf_nodes = {node for node in graph if len(graph[node]) == 0} - # Add them to the list - sorting_list += list(leaf_nodes) + # Add them to the list as next level + sorting_list.append([name_mapping[node] for node in leaf_nodes]) # Remove the leafs from the graph (and from the deps of other nodes) graph = {node: deps - leaf_nodes for node, deps in graph.items() if node not in leaf_nodes} - return [name_mapping[x] for x in sorting_list] + return sorting_list # Function to extract class and import info from a file @@ -63,7 +75,16 @@ def find_priority_list(py_files): py_files: List of paths to the modular files Returns: - A tuple with the ordered files (list) and their dependencies (dict) + Ordered list of lists of files and their dependencies (dict) + + For example, ordered_files might be: + [ + ["../modular_llama.py", "../modular_gemma.py"], # level 0 + ["../modular_llama4.py", "../modular_gemma2.py"], # level 1 + ["../modular_glm4.py"], # level 2 + ] + which means llama and gemma do not depend on any other modular models, while llama4 and gemma2 + depend on the models in the first list, and glm4 depends on the models in the second and (optionally) in the first list. 
""" dependencies = map_dependencies(py_files) ordered_files = topological_sort(dependencies) diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index e138277b03e..731839b0629 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1775,6 +1775,7 @@ if __name__ == "__main__": files_to_parse[i] = full_path priority_list, _ = find_priority_list(files_to_parse) + priority_list = [item for sublist in priority_list for item in sublist] # flatten the list of lists assert len(priority_list) == len(files_to_parse), "Some files will not be converted" for file_name in priority_list: