From 2c22bc79c28feab9070e9c4e25c8577e0d05be8e Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Thu, 6 Apr 2023 17:45:55 +0200
Subject: [PATCH] Make tiny model creation + pipeline testing more robust (#22500)

* Final Tiny things

---------

Co-authored-by: ydshieh
---
 .github/workflows/check_tiny_models.yml       | 82 +++++++++++++++++++
 .github/workflows/update_tiny_models.yml      | 47 -----------
 tests/models/led/test_modeling_led.py         |  9 ++
 .../models/nllb_moe/test_modeling_nllb_moe.py |  7 ++
 .../models/splinter/test_modeling_splinter.py |  2 +
 tests/test_pipeline_mixin.py                  | 19 ++++-
 utils/create_dummy_models.py                  | 49 +++++++----
 utils/update_tiny_models.py                   | 10 +++
 8 files changed, 161 insertions(+), 64 deletions(-)
 create mode 100644 .github/workflows/check_tiny_models.yml
 delete mode 100644 .github/workflows/update_tiny_models.yml

diff --git a/.github/workflows/check_tiny_models.yml b/.github/workflows/check_tiny_models.yml
new file mode 100644
index 00000000000..5a4cb9622f0
--- /dev/null
+++ b/.github/workflows/check_tiny_models.yml
@@ -0,0 +1,82 @@
+name: Check Tiny Models
+
+on:
+  push:
+    branches:
+      - check_tiny_models*
+  repository_dispatch:
+  schedule:
+    - cron: "0 2 * * *"
+
+env:
+  TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }}
+
+jobs:
+  check_tiny_models:
+    name: Check tiny models
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          # Semantic version range syntax or exact version of a Python version
+          python-version: '3.8'
+          # Optional - x64 or x86 architecture, defaults to x64
+          architecture: 'x64'
+
+      - name: Install
+        run: |
+          sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake
+          pip install --upgrade pip
+          python -m pip install -U .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video,tf-cpu]
+          pip install tensorflow_probability
+          python -m pip install -U natten
+
+      - name: Create all tiny models (locally)
+        run: |
+          python utils/create_dummy_models.py tiny_local_models --all --num_workers 2
+
+      - name: Local tiny model reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: tiny_local_model_creation_reports
+          path: tiny_local_models/reports
+
+      # GitHub-hosted runners have 2-core CPUs
+      - name: Run pipeline tests against all new (local) tiny models
+        run: |
+          OMP_NUM_THREADS=1 TRANSFORMERS_TINY_MODEL_PATH=tiny_local_models python -m pytest --max-worker-restart=0 -n 2 --dist=loadfile -s -rA --make-reports=tests_pipelines tests/models -m is_pipeline_test -k "test_pipeline_" | tee tests_output.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: tiny_local_model_creation_reports
+          path: reports/tests_pipelines
+
+      - name: Create + Upload tiny models for new model architecture(s)
+        run: |
+          python utils/update_tiny_models.py --num_workers 2
+
+      - name: Full report
+        run: cat tiny_models/reports/tiny_model_creation_report.json
+
+      - name: Failure report
+        run: cat tiny_models/reports/simple_failed_report.txt
+
+      - name: Summary report
+        run: cat tiny_models/reports/tiny_model_summary.json
+
+      - name: New tiny model creation reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: tiny_model_creation_reports
+          path: tiny_models/reports
diff --git a/.github/workflows/update_tiny_models.yml b/.github/workflows/update_tiny_models.yml
deleted file mode 100644
index bf70c4f9372..00000000000
--- a/.github/workflows/update_tiny_models.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-name: Update Tiny Models
-
-on:
-  push:
-    branches:
-      - update_tiny_models*
-  repository_dispatch:
-  schedule:
-    - cron: "0 2 * * *"
-
-env:
-  TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }}
-
-jobs:
-  update_tiny_models:
-    name: Update tiny models
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout transformers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: Install
-        run: |
-          python -m pip install -U .[dev]
-          python -m pip install -U natten
-
-      - name: Update tiny models
-        run: |
-          python utils/update_tiny_models.py
-
-      - name: Full report
-        run: cat tiny_models/reports/tiny_model_creation_report.json
-
-      - name: Failure report
-        run: cat tiny_models/reports/simple_failed_report.txt
-
-      - name: Summary report
-        run: cat tiny_models/reports/tiny_model_summary.json
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: tiny_model_creation_reports
-          path: tiny_models/reports
diff --git a/tests/models/led/test_modeling_led.py b/tests/models/led/test_modeling_led.py
index 371ebedadff..31c78eacc64 100644
--- a/tests/models/led/test_modeling_led.py
+++ b/tests/models/led/test_modeling_led.py
@@ -294,6 +294,15 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
     test_missing_keys = False
     test_torchscript = False
 
+    # TODO: Fix the failed tests when this model gets more usage
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"):
+            return True
+
+        return False
+
     def setUp(self):
         self.model_tester = LEDModelTester(self)
         self.config_tester = ConfigTester(self, config_class=LEDConfig)
diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py
index 76cf4c0ea48..9f072a06d2e 100644
--- a/tests/models/nllb_moe/test_modeling_nllb_moe.py
+++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py
@@ -265,6 +265,13 @@ class NllbMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
     test_missing_keys = True
     test_torchscript = False
 
+    # TODO: Fix the failed tests when this model gets more usage
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        # Saving the slow tokenizer after saving the fast tokenizer causes the loading of the latter to hang forever.
+        return True
+
     def setUp(self):
         self.model_tester = NllbMoeModelTester(self)
         self.config_tester = ConfigTester(self, config_class=NllbMoeConfig)
diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py
index bf234b9e61e..24a0753157b 100644
--- a/tests/models/splinter/test_modeling_splinter.py
+++ b/tests/models/splinter/test_modeling_splinter.py
@@ -230,6 +230,8 @@ class SplinterModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
     ):
         if pipeline_test_casse_name == "QAPipelineTests":
             return True
+        elif pipeline_test_casse_name == "FeatureExtractionPipelineTests" and tokenizer_name.endswith("Fast"):
+            return True
 
         return False
 
diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py
index c047b959d5e..82a23a94b40 100644
--- a/tests/test_pipeline_mixin.py
+++ b/tests/test_pipeline_mixin.py
@@ -93,7 +93,14 @@ for task, task_info in pipeline_test_mapping.items():
     }
 
 
-TINY_MODEL_SUMMARY_FILE_PATH = os.path.join(Path(__file__).parent.parent, "tests/utils/tiny_model_summary.json")
+# The default value `hf-internal-testing` is for running the pipeline testing against the tiny models on the Hub.
+# For debugging purposes, we can specify a local path, which is the `output_path` argument of a previous run of
+# `utils/create_dummy_models.py`.
+TRANSFORMERS_TINY_MODEL_PATH = os.environ.get("TRANSFORMERS_TINY_MODEL_PATH", "hf-internal-testing")
+if TRANSFORMERS_TINY_MODEL_PATH == "hf-internal-testing":
+    TINY_MODEL_SUMMARY_FILE_PATH = os.path.join(Path(__file__).parent.parent, "tests/utils/tiny_model_summary.json")
+else:
+    TINY_MODEL_SUMMARY_FILE_PATH = os.path.join(TRANSFORMERS_TINY_MODEL_PATH, "reports", "tiny_model_summary.json")
 
 with open(TINY_MODEL_SUMMARY_FILE_PATH) as fp:
     tiny_model_summary = json.load(fp)
@@ -146,12 +153,15 @@ class PipelineTesterMixin:
             if model_arch_name in tiny_model_summary:
                 tokenizer_names = tiny_model_summary[model_arch_name]["tokenizer_classes"]
                 processor_names = tiny_model_summary[model_arch_name]["processor_classes"]
-                commit = tiny_model_summary[model_arch_name]["sha"]
+                if "sha" in tiny_model_summary[model_arch_name]:
+                    commit = tiny_model_summary[model_arch_name]["sha"]
             # Adding `None` (if empty) so we can generate tests
             tokenizer_names = [None] if len(tokenizer_names) == 0 else tokenizer_names
             processor_names = [None] if len(processor_names) == 0 else processor_names
 
             repo_name = f"tiny-random-{model_arch_name}"
+            if TRANSFORMERS_TINY_MODEL_PATH != "hf-internal-testing":
+                repo_name = model_arch_name
 
             self.run_model_pipeline_tests(
                 task, repo_name, model_architecture, tokenizer_names, processor_names, commit
@@ -210,7 +220,10 @@ class PipelineTesterMixin:
             processor_name (`str`):
                 The name of a subclass of `BaseImageProcessor` or `FeatureExtractionMixin`.
""" - repo_id = f"hf-internal-testing/{repo_name}" + repo_id = f"{TRANSFORMERS_TINY_MODEL_PATH}/{repo_name}" + if TRANSFORMERS_TINY_MODEL_PATH != "hf-internal-testing": + model_type = model_architecture.config_class.model_type + repo_id = os.path.join(TRANSFORMERS_TINY_MODEL_PATH, model_type, repo_name) tokenizer = None if tokenizer_name is not None: diff --git a/utils/create_dummy_models.py b/utils/create_dummy_models.py index cba13c12d3c..a9ea64c58c8 100644 --- a/utils/create_dummy_models.py +++ b/utils/create_dummy_models.py @@ -18,6 +18,7 @@ import collections.abc import copy import inspect import json +import multiprocessing import os import shutil import tempfile @@ -679,12 +680,22 @@ def convert_processors(processors, tiny_config, output_folder, result): if hasattr(tiny_config, "max_position_embeddings") and tiny_config.max_position_embeddings > 0: if fast_tokenizer is not None: - if fast_tokenizer.__class__.__name__ in ["RobertaTokenizerFast", "XLMRobertaTokenizerFast"]: + if fast_tokenizer.__class__.__name__ in [ + "RobertaTokenizerFast", + "XLMRobertaTokenizerFast", + "LongformerTokenizerFast", + "MPNetTokenizerFast", + ]: fast_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2 else: fast_tokenizer.model_max_length = tiny_config.max_position_embeddings if slow_tokenizer is not None: - if slow_tokenizer.__class__.__name__ in ["RobertaTokenizer", "XLMRobertaTokenizer"]: + if slow_tokenizer.__class__.__name__ in [ + "RobertaTokenizer", + "XLMRobertaTokenizer", + "LongformerTokenizer", + "MPNetTokenizer", + ]: slow_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2 else: slow_tokenizer.model_max_length = tiny_config.max_position_embeddings @@ -1047,6 +1058,10 @@ def build(config_class, models_to_create, output_dir): The directory to save all the checkpoints. Each model architecture will be saved in a subdirectory under it. Models in different frameworks with the same architecture will be saved in the same subdirectory. 
""" + if data["training_ds"] is None or data["testing_ds"] is None: + ds = load_dataset("wikitext", "wikitext-2-raw-v1") + data["training_ds"] = ds["train"] + data["testing_ds"] = ds["test"] if config_class.model_type in [ "encoder-decoder", @@ -1323,6 +1338,7 @@ def create_tiny_models( upload, organization, token, + num_workers=1, ): clone_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) if os.getcwd() != clone_path: @@ -1343,10 +1359,6 @@ def create_tiny_models( pytorch_arch_mappings = [getattr(transformers_module, x) for x in _pytorch_arch_mappings] tensorflow_arch_mappings = [getattr(transformers_module, x) for x in _tensorflow_arch_mappings] - ds = load_dataset("wikitext", "wikitext-2-raw-v1") - data["training_ds"] = ds["train"] - data["testing_ds"] = ds["test"] - config_classes = CONFIG_MAPPING.values() if not all: config_classes = [CONFIG_MAPPING[model_type] for model_type in model_types] @@ -1363,11 +1375,19 @@ def create_tiny_models( to_create[c] = {"processor": processors, "pytorch": models, "tensorflow": tf_models} results = {} - for c, models_to_create in list(to_create.items()): - print(f"Create models for {c.__name__} ...") - result = build(c, models_to_create, output_dir=os.path.join(output_path, c.model_type)) - results[c.__name__] = result - print("=" * 40) + if num_workers <= 1: + for c, models_to_create in list(to_create.items()): + print(f"Create models for {c.__name__} ...") + result = build(c, models_to_create, output_dir=os.path.join(output_path, c.model_type)) + results[c.__name__] = result + print("=" * 40) + else: + all_build_args = [] + for c, models_to_create in list(to_create.items()): + all_build_args.append((c, models_to_create, os.path.join(output_path, c.model_type))) + with multiprocessing.Pool() as pool: + results = pool.starmap(build, all_build_args) + results = {buid_args[0].__name__: result for buid_args, result in zip(all_build_args, results)} if upload: if organization is None: @@ -1426,9 +1446,8 @@ def create_tiny_models( if __name__ == "__main__": - ds = load_dataset("wikitext", "wikitext-2-raw-v1") - training_ds = ds["train"] - testing_ds = ds["test"] + # This has to be `spawn` to avoid hanging forever! + multiprocessing.set_start_method("spawn") def list_str(values): return values.split(",") @@ -1465,6 +1484,7 @@ if __name__ == "__main__": "--token", default=None, type=str, help="A valid authentication token for HuggingFace Hub with write access." ) parser.add_argument("output_path", type=Path, help="Path indicating where to store generated model.") + parser.add_argument("--num_workers", default=1, type=int, help="The number of workers to run.") args = parser.parse_args() @@ -1480,4 +1500,5 @@ if __name__ == "__main__": args.upload, args.organization, args.token, + args.num_workers, ) diff --git a/utils/update_tiny_models.py b/utils/update_tiny_models.py index 1b53737ed73..997da679840 100644 --- a/utils/update_tiny_models.py +++ b/utils/update_tiny_models.py @@ -21,8 +21,10 @@ version of `tests/utils/tiny_model_summary.json`. That updated file should be me """ +import argparse import copy import json +import multiprocessing import os import time @@ -197,6 +199,13 @@ def update_tiny_model_summary_file(report_path): if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--num_workers", default=1, type=int, help="The number of workers to run.") + args = parser.parse_args() + + # This has to be `spawn` to avoid hanging forever! 
+    multiprocessing.set_start_method("spawn")
+
     output_path = "tiny_models"
     all = True
     model_types = None
@@ -214,6 +223,7 @@ if __name__ == "__main__":
         upload,
         organization,
         token=os.environ.get("TOKEN", None),
+        num_workers=args.num_workers,
     )
 
     update_tiny_model_summary_file(report_path=os.path.join(output_path, "reports"))
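
Note on the `TRANSFORMERS_TINY_MODEL_PATH` switch added in `tests/test_pipeline_mixin.py`: by default the pipeline tests pull tiny checkpoints from the `hf-internal-testing` organization on the Hub; when the variable points at the `output_path` of a previous `utils/create_dummy_models.py` run (as the new workflow does with `tiny_local_models`), both the summary file and the model locations are resolved locally. A small self-contained sketch of that resolution logic is shown below; the helper function itself is illustrative and is not part of the patch.

import os


def resolve_tiny_model_source(model_arch_name, model_type):
    """Illustrative helper (not in the patch) mirroring the lookup added to tests/test_pipeline_mixin.py."""
    tiny_model_path = os.environ.get("TRANSFORMERS_TINY_MODEL_PATH", "hf-internal-testing")
    if tiny_model_path == "hf-internal-testing":
        # Default: tiny models hosted on the Hub, summary file checked into the repo.
        repo_id = f"{tiny_model_path}/tiny-random-{model_arch_name}"
        summary_file = os.path.join("tests", "utils", "tiny_model_summary.json")
    else:
        # Debug mode: a local dump produced by a previous `utils/create_dummy_models.py` run.
        repo_id = os.path.join(tiny_model_path, model_type, model_arch_name)
        summary_file = os.path.join(tiny_model_path, "reports", "tiny_model_summary.json")
    return repo_id, summary_file


if __name__ == "__main__":
    # e.g. TRANSFORMERS_TINY_MODEL_PATH=tiny_local_models, as in the new workflow step.
    print(resolve_tiny_model_source("BertModel", "bert"))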
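
Note on the `--num_workers` option added to `utils/create_dummy_models.py` and `utils/update_tiny_models.py`: with more than one worker, the per-architecture `build` calls are spread over a `multiprocessing.Pool` via `starmap`, and the start method is forced to `spawn` because, per the comment in the patch, the default start method can hang forever. The sketch below reproduces only that dispatch structure with a stand-in `build` function; everything other than the stdlib calls is illustrative.

import multiprocessing
import os


def build(config_name, models_to_create, output_dir):
    # Stand-in for `build` in utils/create_dummy_models.py: the real one writes tiny
    # checkpoints under `output_dir`; here we only return a small report dict.
    os.makedirs(output_dir, exist_ok=True)
    return {"config": config_name, "models": models_to_create}


def create_all(to_create, output_path, num_workers=1):
    if num_workers <= 1:
        # Sequential branch, mirroring `if num_workers <= 1:` in the patch.
        return {name: build(name, models, os.path.join(output_path, name)) for name, models in to_create.items()}
    # Parallel branch: gather positional argument tuples, then fan them out with starmap.
    all_build_args = [(name, models, os.path.join(output_path, name)) for name, models in to_create.items()]
    with multiprocessing.Pool(num_workers) as pool:
        results = pool.starmap(build, all_build_args)
    return {build_args[0]: result for build_args, result in zip(all_build_args, results)}


if __name__ == "__main__":
    # `spawn` instead of the platform default, as the patch's comments require, to avoid hangs.
    multiprocessing.set_start_method("spawn")
    print(create_all({"bert": ["BertModel"], "gpt2": ["GPT2Model"]}, "tiny_local_models", num_workers=2))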