Make tiny model creation + pipeline testing more robust (#22500)

* Final Tiny things

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Yih-Dar 2023-04-06 17:45:55 +02:00 committed by GitHub
parent 12d51db243
commit 2c22bc79c2
8 changed files with 161 additions and 64 deletions

.github/workflows/check_tiny_models.yml (new file)

@@ -0,0 +1,82 @@
name: Check Tiny Models
on:
push:
branches:
- check_tiny_models*
repository_dispatch:
schedule:
- cron: "0 2 * * *"
env:
TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }}
jobs:
check_tiny_models:
name: Check tiny models
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v3
with:
fetch-depth: 2
- uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
# Semantic version range syntax or exact version of a Python version
python-version: '3.8'
# Optional - x64 or x86 architecture, defaults to x64
architecture: 'x64'
- name: Install
run: |
sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake
pip install --upgrade pip
python -m pip install -U .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video,tf-cpu]
pip install tensorflow_probability
python -m pip install -U natten
- name: Create all tiny models (locally)
run: |
python utils/create_dummy_models.py tiny_local_models --all --num_workers 2
- name: Local tiny model reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: tiny_local_model_creation_reports
path: tiny_local_models/reports
# GitHub-hosted runners have 2-core CPUs
- name: Run pipeline tests against all new (local) tiny models
run: |
OMP_NUM_THREADS=1 TRANSFORMERS_TINY_MODEL_PATH=tiny_local_models python -m pytest --max-worker-restart=0 -n 2 --dist=loadfile -s -rA --make-reports=tests_pipelines tests/models -m is_pipeline_test -k "test_pipeline_" | tee tests_output.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: tiny_local_model_creation_reports
path: reports/tests_pipelines
- name: Create + Upload tiny models for new model architecture(s)
run: |
python utils/update_tiny_models.py --num_workers 2
- name: Full report
run: cat tiny_models/reports/tiny_model_creation_report.json
- name: Failure report
run: cat tiny_models/reports/simple_failed_report.txt
- name: Summary report
run: cat tiny_models/reports/tiny_model_summary.json
- name: New tiny model creation reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: tiny_model_creation_reports
path: tiny_models/reports
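
For reference, the creation and pipeline-test steps above can also be reproduced outside CI. Below is a minimal sketch using `subprocess` (the report-related pytest flags are omitted), assuming the tiny models were first created from the repository root with `python utils/create_dummy_models.py tiny_local_models --all --num_workers 2`:

import os
import subprocess

# Mirror the workflow step: single-threaded BLAS, tests pointed at the local tiny models.
env = dict(os.environ, OMP_NUM_THREADS="1", TRANSFORMERS_TINY_MODEL_PATH="tiny_local_models")
subprocess.run(
    ["python", "-m", "pytest", "-n", "2", "--dist=loadfile", "-m", "is_pipeline_test",
     "-k", "test_pipeline_", "tests/models"],
    env=env,
    check=False,  # pytest exits non-zero when tests fail; inspect the CompletedProcess instead
)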

.github/workflows/update_tiny_models.yml (deleted file)

@@ -1,47 +0,0 @@
name: Update Tiny Models
on:
push:
branches:
- update_tiny_models*
repository_dispatch:
schedule:
- cron: "0 2 * * *"
env:
TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }}
jobs:
update_tiny_models:
name: Update tiny models
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: Install
run: |
python -m pip install -U .[dev]
python -m pip install -U natten
- name: Update tiny models
run: |
python utils/update_tiny_models.py
- name: Full report
run: cat tiny_models/reports/tiny_model_creation_report.json
- name: Failure report
run: cat tiny_models/reports/simple_failed_report.txt
- name: Summary report
run: cat tiny_models/reports/tiny_model_summary.json
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: tiny_model_creation_reports
path: tiny_models/reports

tests/models/led/test_modeling_led.py

@@ -294,6 +294,15 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     test_missing_keys = False
     test_torchscript = False
 
+    # TODO: Fix the failed tests when this model gets more usage
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"):
+            return True
+
+        return False
+
     def setUp(self):
         self.model_tester = LEDModelTester(self)
         self.config_tester = ConfigTester(self, config_class=LEDConfig)

tests/models/nllb_moe/test_modeling_nllb_moe.py

@@ -265,6 +265,13 @@ class NllbMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     test_missing_keys = True
     test_torchscript = False
 
+    # TODO: Fix the failed tests when this model gets more usage
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        # Saving the slow tokenizer after saving the fast tokenizer causes the loading of the later hanging forever.
+        return True
+
     def setUp(self):
         self.model_tester = NllbMoeModelTester(self)
         self.config_tester = ConfigTester(self, config_class=NllbMoeConfig)

tests/models/splinter/test_modeling_splinter.py

@@ -230,6 +230,8 @@ class SplinterModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     ):
         if pipeline_test_casse_name == "QAPipelineTests":
             return True
+        elif pipeline_test_casse_name == "FeatureExtractionPipelineTests" and tokenizer_name.endswith("Fast"):
+            return True
 
         return False
 
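The three model-test hunks above all hinge on the `is_pipeline_test_to_skip` hook. Below is a minimal, self-contained sketch (not the actual `PipelineTesterMixin` code) of how such a hook gates a pipeline test; the class and argument values are illustrative only:

class DummyModelTest:
    def is_pipeline_test_to_skip(
        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
    ):
        # Mirrors the LED rule above: skip QA pipeline tests when only a slow tokenizer is available.
        return pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast")


if DummyModelTest().is_pipeline_test_to_skip("QAPipelineTests", None, None, "LEDTokenizer", None):
    print("skipping QAPipelineTests for the slow tokenizer")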

tests/test_pipeline_mixin.py

@@ -93,7 +93,14 @@ for task, task_info in pipeline_test_mapping.items():
 }
 
-TINY_MODEL_SUMMARY_FILE_PATH = os.path.join(Path(__file__).parent.parent, "tests/utils/tiny_model_summary.json")
+# The default value `hf-internal-testing` is for running the pipeline testing against the tiny models on the Hub.
+# For debugging purpose, we can specify a local path which is the `output_path` argument of a previous run of
+# `utils/create_dummy_models.py`.
+TRANSFORMERS_TINY_MODEL_PATH = os.environ.get("TRANSFORMERS_TINY_MODEL_PATH", "hf-internal-testing")
+if TRANSFORMERS_TINY_MODEL_PATH == "hf-internal-testing":
+    TINY_MODEL_SUMMARY_FILE_PATH = os.path.join(Path(__file__).parent.parent, "tests/utils/tiny_model_summary.json")
+else:
+    TINY_MODEL_SUMMARY_FILE_PATH = os.path.join(TRANSFORMERS_TINY_MODEL_PATH, "reports", "tiny_model_summary.json")
 
 with open(TINY_MODEL_SUMMARY_FILE_PATH) as fp:
     tiny_model_summary = json.load(fp)
 
@@ -146,12 +153,15 @@ class PipelineTesterMixin:
             if model_arch_name in tiny_model_summary:
                 tokenizer_names = tiny_model_summary[model_arch_name]["tokenizer_classes"]
                 processor_names = tiny_model_summary[model_arch_name]["processor_classes"]
-                commit = tiny_model_summary[model_arch_name]["sha"]
+                if "sha" in tiny_model_summary[model_arch_name]:
+                    commit = tiny_model_summary[model_arch_name]["sha"]
             # Adding `None` (if empty) so we can generate tests
             tokenizer_names = [None] if len(tokenizer_names) == 0 else tokenizer_names
             processor_names = [None] if len(processor_names) == 0 else processor_names
 
             repo_name = f"tiny-random-{model_arch_name}"
+            if TRANSFORMERS_TINY_MODEL_PATH != "hf-internal-testing":
+                repo_name = model_arch_name
 
             self.run_model_pipeline_tests(
                 task, repo_name, model_architecture, tokenizer_names, processor_names, commit
@@ -210,7 +220,10 @@ class PipelineTesterMixin:
             processor_name (`str`):
                 The name of a subclass of `BaseImageProcessor` or `FeatureExtractionMixin`.
         """
-        repo_id = f"hf-internal-testing/{repo_name}"
+        repo_id = f"{TRANSFORMERS_TINY_MODEL_PATH}/{repo_name}"
+        if TRANSFORMERS_TINY_MODEL_PATH != "hf-internal-testing":
+            model_type = model_architecture.config_class.model_type
+            repo_id = os.path.join(TRANSFORMERS_TINY_MODEL_PATH, model_type, repo_name)
 
         tokenizer = None
         if tokenizer_name is not None:
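
A condensed sketch of the lookup behaviour introduced above, using illustrative model names (the authoritative logic is the diff in `tests/test_pipeline_mixin.py` itself):

import os

TRANSFORMERS_TINY_MODEL_PATH = os.environ.get("TRANSFORMERS_TINY_MODEL_PATH", "hf-internal-testing")


def resolve_repo_id(model_arch_name, model_type):
    # Hub case: tiny checkpoints live under the hf-internal-testing organization.
    if TRANSFORMERS_TINY_MODEL_PATH == "hf-internal-testing":
        return f"hf-internal-testing/tiny-random-{model_arch_name}"
    # Local case: `create_dummy_models.py` saves checkpoints under <output_path>/<model_type>/<arch_name>.
    return os.path.join(TRANSFORMERS_TINY_MODEL_PATH, model_type, model_arch_name)


print(resolve_repo_id("BertModel", "bert"))  # hf-internal-testing/tiny-random-BertModel by default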

utils/create_dummy_models.py

@@ -18,6 +18,7 @@ import collections.abc
 import copy
 import inspect
 import json
+import multiprocessing
 import os
 import shutil
 import tempfile
@@ -679,12 +680,22 @@ def convert_processors(processors, tiny_config, output_folder, result):
 
     if hasattr(tiny_config, "max_position_embeddings") and tiny_config.max_position_embeddings > 0:
         if fast_tokenizer is not None:
-            if fast_tokenizer.__class__.__name__ in ["RobertaTokenizerFast", "XLMRobertaTokenizerFast"]:
+            if fast_tokenizer.__class__.__name__ in [
+                "RobertaTokenizerFast",
+                "XLMRobertaTokenizerFast",
+                "LongformerTokenizerFast",
+                "MPNetTokenizerFast",
+            ]:
                 fast_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
             else:
                 fast_tokenizer.model_max_length = tiny_config.max_position_embeddings
         if slow_tokenizer is not None:
-            if slow_tokenizer.__class__.__name__ in ["RobertaTokenizer", "XLMRobertaTokenizer"]:
+            if slow_tokenizer.__class__.__name__ in [
+                "RobertaTokenizer",
+                "XLMRobertaTokenizer",
+                "LongformerTokenizer",
+                "MPNetTokenizer",
+            ]:
                 slow_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
             else:
                 slow_tokenizer.model_max_length = tiny_config.max_position_embeddings
@@ -1047,6 +1058,10 @@ def build(config_class, models_to_create, output_dir):
             The directory to save all the checkpoints. Each model architecture will be saved in a subdirectory under
             it. Models in different frameworks with the same architecture will be saved in the same subdirectory.
     """
+    if data["training_ds"] is None or data["testing_ds"] is None:
+        ds = load_dataset("wikitext", "wikitext-2-raw-v1")
+        data["training_ds"] = ds["train"]
+        data["testing_ds"] = ds["test"]
 
     if config_class.model_type in [
         "encoder-decoder",
@@ -1323,6 +1338,7 @@ def create_tiny_models(
     upload,
     organization,
     token,
+    num_workers=1,
 ):
     clone_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
     if os.getcwd() != clone_path:
@@ -1343,10 +1359,6 @@ def create_tiny_models(
     pytorch_arch_mappings = [getattr(transformers_module, x) for x in _pytorch_arch_mappings]
     tensorflow_arch_mappings = [getattr(transformers_module, x) for x in _tensorflow_arch_mappings]
 
-    ds = load_dataset("wikitext", "wikitext-2-raw-v1")
-    data["training_ds"] = ds["train"]
-    data["testing_ds"] = ds["test"]
-
     config_classes = CONFIG_MAPPING.values()
     if not all:
         config_classes = [CONFIG_MAPPING[model_type] for model_type in model_types]
@@ -1363,11 +1375,19 @@ def create_tiny_models(
         to_create[c] = {"processor": processors, "pytorch": models, "tensorflow": tf_models}
 
     results = {}
-    for c, models_to_create in list(to_create.items()):
-        print(f"Create models for {c.__name__} ...")
-        result = build(c, models_to_create, output_dir=os.path.join(output_path, c.model_type))
-        results[c.__name__] = result
-        print("=" * 40)
+    if num_workers <= 1:
+        for c, models_to_create in list(to_create.items()):
+            print(f"Create models for {c.__name__} ...")
+            result = build(c, models_to_create, output_dir=os.path.join(output_path, c.model_type))
+            results[c.__name__] = result
+            print("=" * 40)
+    else:
+        all_build_args = []
+        for c, models_to_create in list(to_create.items()):
+            all_build_args.append((c, models_to_create, os.path.join(output_path, c.model_type)))
+        with multiprocessing.Pool() as pool:
+            results = pool.starmap(build, all_build_args)
+            results = {buid_args[0].__name__: result for buid_args, result in zip(all_build_args, results)}
 
     if upload:
         if organization is None:
@@ -1426,9 +1446,8 @@
 
 
 if __name__ == "__main__":
-    ds = load_dataset("wikitext", "wikitext-2-raw-v1")
-    training_ds = ds["train"]
-    testing_ds = ds["test"]
+    # This has to be `spawn` to avoid hanging forever!
+    multiprocessing.set_start_method("spawn")
 
     def list_str(values):
         return values.split(",")
@@ -1465,6 +1484,7 @@ if __name__ == "__main__":
         "--token", default=None, type=str, help="A valid authentication token for HuggingFace Hub with write access."
     )
     parser.add_argument("output_path", type=Path, help="Path indicating where to store generated model.")
+    parser.add_argument("--num_workers", default=1, type=int, help="The number of workers to run.")
 
     args = parser.parse_args()
 
@@ -1480,4 +1500,5 @@ if __name__ == "__main__":
         args.upload,
         args.organization,
         args.token,
+        args.num_workers,
     )
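
A minimal, self-contained sketch of the worker-pool pattern used above: the `spawn` start method (to avoid the fork-related hangs the diff comments mention) combined with `multiprocessing.Pool.starmap` over prepared argument tuples. `fake_build` is a stand-in for the real `build` function:

import multiprocessing


def fake_build(config_name, output_dir):
    # Stand-in for `build`: in the real script this creates and saves the tiny checkpoints.
    return f"would build tiny {config_name} checkpoints under {output_dir}"


if __name__ == "__main__":
    multiprocessing.set_start_method("spawn")
    all_build_args = [("bert", "tiny_local_models/bert"), ("gpt2", "tiny_local_models/gpt2")]
    with multiprocessing.Pool() as pool:
        results = pool.starmap(fake_build, all_build_args)
    print(dict(zip((name for name, _ in all_build_args), results)))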

utils/update_tiny_models.py

@@ -21,8 +21,10 @@ version of `tests/utils/tiny_model_summary.json`. That updated file should be me
 """
 
+import argparse
 import copy
 import json
+import multiprocessing
 import os
 import time
 
@@ -197,6 +199,13 @@ def update_tiny_model_summary_file(report_path):
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num_workers", default=1, type=int, help="The number of workers to run.")
+    args = parser.parse_args()
+
+    # This has to be `spawn` to avoid hanging forever!
+    multiprocessing.set_start_method("spawn")
+
     output_path = "tiny_models"
     all = True
     model_types = None
@@ -214,6 +223,7 @@ if __name__ == "__main__":
         upload,
         organization,
         token=os.environ.get("TOKEN", None),
+        num_workers=args.num_workers,
     )
 
     update_tiny_model_summary_file(report_path=os.path.join(output_path, "reports"))
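
For reference, a short sketch of inspecting the summary report this script writes (and that `tests/test_pipeline_mixin.py` consumes); the keys shown come from the diff above, and the path assumes the default `tiny_models` output directory:

import json

with open("tiny_models/reports/tiny_model_summary.json") as fp:
    tiny_model_summary = json.load(fp)

for arch_name, info in tiny_model_summary.items():
    # "sha" may be missing for locally created models, which is why the mixin now checks for it.
    sha = info.get("sha", "<no sha: local build>")
    print(arch_name, info["tokenizer_classes"], info["processor_classes"], sha)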