Test fetch v2 (#22367)

* Test fetcher v2

* Fix regexes

* Remove sanity check

* Fake modification to OPT

* Fixes some .sep issues

* Remove fake OPT change

* Fake modif for BERT

* Fake modif for init

* Exclude SageMaker tests

* Fix test and remove fake modif

* Fake setup modif

* Fake pipeline modif

* Remove all fake modifs

* Adds options to skip/force tests

* [test-all-models] Fake modif for BERT

* Try this way

* Does the command actually work?

* [test-all-models] Try again!

* [skip circleci] Remove fake modif

* Remove debug statements

* Add the list of important models

* Quality

* Update utils/tests_fetcher.py

Co-authored-by: Lysandre Debut <lysandre.debut@reseau.eseo.fr>

* Address review comments

* Address review comments

* Fix and add test

* Apply suggestions from code review

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>

* Address review comments

---------

Co-authored-by: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
This commit is contained in:
Sylvain Gugger 2023-03-31 16:18:43 -04:00 committed by GitHub
parent 3a9464bd30
commit c612628045
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 1005 additions and 412 deletions

View File

@ -176,7 +176,6 @@ jobs:
- run: python utils/check_config_attributes.py
- run: python utils/check_doctest_list.py
- run: make deps_table_check_updated
- run: python utils/tests_fetcher.py --sanity_check
- run: python utils/update_metadata.py --check-only
- run: python utils/check_task_guides.py

View File

@ -41,7 +41,6 @@ repo-consistency:
python utils/check_config_docstrings.py
python utils/check_config_attributes.py
python utils/check_doctest_list.py
python utils/tests_fetcher.py --sanity_check
python utils/update_metadata.py --check-only
python utils/check_task_guides.py

View File

@ -13,52 +13,661 @@
# limitations under the License.
import os
import shutil
import sys
import tempfile
import unittest
from contextlib import contextmanager
from pathlib import Path
from git import Repo
git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
sys.path.append(os.path.join(git_repo_path, "utils"))
transformers_path = os.path.join(git_repo_path, "src", "transformers")
# Tests are run against this specific commit for reproducibility
# https://github.com/huggingface/transformers/tree/07f6690206e39ed7a4d9dbc58824314f7089bb38
GIT_TEST_SHA = "07f6690206e39ed7a4d9dbc58824314f7089bb38"
from tests_fetcher import checkout_commit, clean_code, get_module_dependencies # noqa: E402
from transformers.testing_utils import CaptureStdout
class CheckDummiesTester(unittest.TestCase):
REPO_PATH = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
sys.path.append(os.path.join(REPO_PATH, "utils"))
import tests_fetcher # noqa: E402
from tests_fetcher import ( # noqa: E402
checkout_commit,
clean_code,
create_module_to_test_map,
create_reverse_dependency_map,
create_reverse_dependency_tree,
diff_is_docstring_only,
extract_imports,
get_all_tests,
get_diff,
get_module_dependencies,
get_tree_starting_at,
infer_tests_to_run,
parse_commit_message,
print_tree_deps_of,
)
BERT_MODELING_FILE = "src/transformers/models/bert/modeling_bert.py"
BERT_MODEL_FILE = """from ...modeling_utils import PreTrainedModel
from ...utils import is_torch_available
from .configuration_bert import BertConfig
class BertModel:
'''
This is the docstring.
'''
This is the code
"""
BERT_MODEL_FILE_NEW_DOCSTRING = """from ...modeling_utils import PreTrainedModel
from ...utils import is_torch_available
from .configuration_bert import BertConfig
class BertModel:
'''
This is the docstring. It has been updated.
'''
This is the code
"""
BERT_MODEL_FILE_NEW_CODE = """from ...modeling_utils import PreTrainedModel
from ...utils import is_torch_available
from .configuration_bert import BertConfig
class BertModel:
'''
This is the docstring.
'''
This is the code. It has been updated
"""
def create_tmp_repo(tmp_dir, models=None):
"""
Creates a repository in a temporary directory mimicking the structure of Transformers. Uses the list of models
provided (which defaults to just `["bert"]`).
"""
tmp_dir = Path(tmp_dir)
if tmp_dir.exists():
shutil.rmtree(tmp_dir)
tmp_dir.mkdir(exist_ok=True)
repo = Repo.init(tmp_dir)
if models is None:
models = ["bert"]
class_names = [model[0].upper() + model[1:] for model in models]
transformers_dir = tmp_dir / "src" / "transformers"
transformers_dir.mkdir(parents=True, exist_ok=True)
with open(transformers_dir / "__init__.py", "w") as f:
init_lines = ["from .utils import cached_file, is_torch_available"]
init_lines.extend(
[f"from .models.{model} import {cls}Config, {cls}Model" for model, cls in zip(models, class_names)]
)
f.write("\n".join(init_lines) + "\n")
with open(transformers_dir / "configuration_utils.py", "w") as f:
f.write("from .utils import cached_file\n\ncode")
with open(transformers_dir / "modeling_utils.py", "w") as f:
f.write("from .utils import cached_file\n\ncode")
utils_dir = tmp_dir / "src" / "transformers" / "utils"
utils_dir.mkdir(exist_ok=True)
with open(utils_dir / "__init__.py", "w") as f:
f.write("from .hub import cached_file\nfrom .imports import is_torch_available\n")
with open(utils_dir / "hub.py", "w") as f:
f.write("import huggingface_hub\n\ncode")
with open(utils_dir / "imports.py", "w") as f:
f.write("code")
model_dir = tmp_dir / "src" / "transformers" / "models"
model_dir.mkdir(parents=True, exist_ok=True)
with open(model_dir / "__init__.py", "w") as f:
f.write("\n".join([f"import {model}" for model in models]))
for model, cls in zip(models, class_names):
model_dir = tmp_dir / "src" / "transformers" / "models" / model
model_dir.mkdir(parents=True, exist_ok=True)
with open(model_dir / "__init__.py", "w") as f:
f.write(f"from .configuration_{model} import {cls}Config\nfrom .modeling_{model} import {cls}Model\n")
with open(model_dir / f"configuration_{model}.py", "w") as f:
f.write("from ...configuration_utils import PretrainedConfig\ncode")
with open(model_dir / f"modeling_{model}.py", "w") as f:
modeling_code = BERT_MODEL_FILE.replace("bert", model).replace("Bert", cls)
f.write(modeling_code)
test_dir = tmp_dir / "tests"
test_dir.mkdir(exist_ok=True)
with open(test_dir / "test_modeling_common.py", "w") as f:
f.write("from transformers.modeling_utils import PreTrainedModel\ncode")
for model, cls in zip(models, class_names):
test_model_dir = test_dir / "models" / model
test_model_dir.mkdir(parents=True, exist_ok=True)
(test_model_dir / "__init__.py").touch()
with open(test_model_dir / f"test_modeling_{model}.py", "w") as f:
f.write(
f"from transformers import {cls}Config, {cls}Model\nfrom ...test_modeling_common import ModelTesterMixin\n\ncode"
)
repo.index.add(["src", "tests"])
repo.index.commit("Initial commit")
repo.create_head("main")
repo.head.reference = repo.refs.main
repo.delete_head("master")
return repo
@contextmanager
def patch_transformer_repo_path(new_folder):
"""
Temporarily patches the variables defines in `tests_fetcher` to use a different location for the repo.
"""
old_repo_path = tests_fetcher.PATH_TO_REPO
tests_fetcher.PATH_TO_REPO = Path(new_folder).resolve()
tests_fetcher.PATH_TO_TRANFORMERS = tests_fetcher.PATH_TO_REPO / "src/transformers"
tests_fetcher.PATH_TO_TESTS = tests_fetcher.PATH_TO_REPO / "tests"
try:
yield
finally:
tests_fetcher.PATH_TO_REPO = old_repo_path
tests_fetcher.PATH_TO_TRANFORMERS = tests_fetcher.PATH_TO_REPO / "src/transformers"
tests_fetcher.PATH_TO_TESTS = tests_fetcher.PATH_TO_REPO / "tests"
def commit_changes(filenames, contents, repo, commit_message="Commit"):
"""
Commit new `contents` to `filenames` inside a given `repo`.
"""
if not isinstance(filenames, list):
filenames = [filenames]
if not isinstance(contents, list):
contents = [contents]
folder = Path(repo.working_dir)
for filename, content in zip(filenames, contents):
with open(folder / filename, "w") as f:
f.write(content)
repo.index.add(filenames)
commit = repo.index.commit(commit_message)
return commit.hexsha
class TestFetcherTester(unittest.TestCase):
def test_checkout_commit(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
repo = create_tmp_repo(tmp_folder)
initial_sha = repo.head.commit.hexsha
new_sha = commit_changes(BERT_MODELING_FILE, BERT_MODEL_FILE_NEW_DOCSTRING, repo)
assert repo.head.commit.hexsha == new_sha
with checkout_commit(repo, initial_sha):
assert repo.head.commit.hexsha == initial_sha
with open(tmp_folder / BERT_MODELING_FILE) as f:
assert f.read() == BERT_MODEL_FILE
assert repo.head.commit.hexsha == new_sha
with open(tmp_folder / BERT_MODELING_FILE) as f:
assert f.read() == BERT_MODEL_FILE_NEW_DOCSTRING
def test_clean_code(self):
# Clean code removes all strings in triple quotes
self.assertEqual(clean_code('"""\nDocstring\n"""\ncode\n"""Long string"""\ncode\n'), "code\ncode")
self.assertEqual(clean_code("'''\nDocstring\n'''\ncode\n'''Long string'''\ncode\n'''"), "code\ncode")
assert clean_code('"""\nDocstring\n"""\ncode\n"""Long string"""\ncode\n') == "code\ncode"
assert clean_code("'''\nDocstring\n'''\ncode\n'''Long string'''\ncode\n'''") == "code\ncode"
# Clean code removes all comments
self.assertEqual(clean_code("code\n# Comment\ncode"), "code\ncode")
self.assertEqual(clean_code("code # inline comment\ncode"), "code \ncode")
assert clean_code("code\n# Comment\ncode") == "code\ncode"
assert clean_code("code # inline comment\ncode") == "code \ncode"
def test_checkout_commit(self):
repo = Repo(git_repo_path)
self.assertNotEqual(repo.head.commit.hexsha, GIT_TEST_SHA)
with checkout_commit(repo, GIT_TEST_SHA):
self.assertEqual(repo.head.commit.hexsha, GIT_TEST_SHA)
self.assertNotEqual(repo.head.commit.hexsha, GIT_TEST_SHA)
def test_get_all_tests(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
create_tmp_repo(tmp_folder)
with patch_transformer_repo_path(tmp_folder):
assert get_all_tests() == ["tests/models/bert", "tests/test_modeling_common.py"]
def test_get_all_tests_on_full_repo(self):
all_tests = get_all_tests()
assert "tests/models/albert" in all_tests
assert "tests/models/bert" in all_tests
assert "tests/repo_utils" in all_tests
assert "tests/test_pipeline_mixin.py" in all_tests
assert "tests/models" not in all_tests
assert "tests/__pycache__" not in all_tests
assert "tests/models/albert/test_modeling_albert.py" not in all_tests
assert "tests/repo_utils/test_tests_fetcher.py" not in all_tests
def test_diff_is_docstring_only(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
repo = create_tmp_repo(tmp_folder)
branching_point = repo.refs.main.commit
bert_file = BERT_MODELING_FILE
commit_changes(bert_file, BERT_MODEL_FILE_NEW_DOCSTRING, repo)
assert diff_is_docstring_only(repo, branching_point, bert_file)
commit_changes(bert_file, BERT_MODEL_FILE_NEW_CODE, repo)
assert not diff_is_docstring_only(repo, branching_point, bert_file)
def test_get_diff(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
repo = create_tmp_repo(tmp_folder)
initial_commit = repo.refs.main.commit
bert_file = BERT_MODELING_FILE
commit_changes(bert_file, BERT_MODEL_FILE_NEW_DOCSTRING, repo)
assert get_diff(repo, repo.head.commit, repo.head.commit.parents) == []
commit_changes(bert_file, BERT_MODEL_FILE_NEW_DOCSTRING + "\n# Adding a comment\n", repo)
assert get_diff(repo, repo.head.commit, repo.head.commit.parents) == []
commit_changes(bert_file, BERT_MODEL_FILE_NEW_CODE, repo)
assert get_diff(repo, repo.head.commit, repo.head.commit.parents) == [
"src/transformers/models/bert/modeling_bert.py"
]
commit_changes("src/transformers/utils/hub.py", "import huggingface_hub\n\nnew code", repo)
assert get_diff(repo, repo.head.commit, repo.head.commit.parents) == ["src/transformers/utils/hub.py"]
assert get_diff(repo, repo.head.commit, [initial_commit]) == [
"src/transformers/models/bert/modeling_bert.py",
"src/transformers/utils/hub.py",
]
def test_extract_imports_relative(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
create_tmp_repo(tmp_folder)
expected_bert_imports = [
("src/transformers/modeling_utils.py", ["PreTrainedModel"]),
("src/transformers/utils/__init__.py", ["is_torch_available"]),
("src/transformers/models/bert/configuration_bert.py", ["BertConfig"]),
]
expected_utils_imports = [
("src/transformers/utils/hub.py", ["cached_file"]),
("src/transformers/utils/imports.py", ["is_torch_available"]),
]
with patch_transformer_repo_path(tmp_folder):
assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports
assert extract_imports("src/transformers/utils/__init__.py") == expected_utils_imports
with open(tmp_folder / BERT_MODELING_FILE, "w") as f:
f.write(
"from ...utils import cached_file, is_torch_available\nfrom .configuration_bert import BertConfig\n"
)
expected_bert_imports = [
("src/transformers/utils/__init__.py", ["cached_file", "is_torch_available"]),
("src/transformers/models/bert/configuration_bert.py", ["BertConfig"]),
]
with patch_transformer_repo_path(tmp_folder):
assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports
# Test with multi-line imports
with open(tmp_folder / BERT_MODELING_FILE, "w") as f:
f.write(
"from ...utils import (\n cached_file,\n is_torch_available\n)\nfrom .configuration_bert import BertConfig\n"
)
expected_bert_imports = [
("src/transformers/models/bert/configuration_bert.py", ["BertConfig"]),
("src/transformers/utils/__init__.py", ["cached_file", "is_torch_available"]),
]
with patch_transformer_repo_path(tmp_folder):
assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports
def test_extract_imports_absolute(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
create_tmp_repo(tmp_folder)
with open(tmp_folder / BERT_MODELING_FILE, "w") as f:
f.write(
"from transformers.utils import cached_file, is_torch_available\nfrom transformers.models.bert.configuration_bert import BertConfig\n"
)
expected_bert_imports = [
("src/transformers/utils/__init__.py", ["cached_file", "is_torch_available"]),
("src/transformers/models/bert/configuration_bert.py", ["BertConfig"]),
]
with patch_transformer_repo_path(tmp_folder):
assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports
# Test with multi-line imports
with open(tmp_folder / BERT_MODELING_FILE, "w") as f:
f.write(
"from transformers.utils import (\n cached_file,\n is_torch_available\n)\nfrom transformers.models.bert.configuration_bert import BertConfig\n"
)
expected_bert_imports = [
("src/transformers/models/bert/configuration_bert.py", ["BertConfig"]),
("src/transformers/utils/__init__.py", ["cached_file", "is_torch_available"]),
]
with patch_transformer_repo_path(tmp_folder):
assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports
# Test with base imports
with open(tmp_folder / BERT_MODELING_FILE, "w") as f:
f.write(
"from transformers.utils import (\n cached_file,\n is_torch_available\n)\nfrom transformers import BertConfig\n"
)
expected_bert_imports = [
("src/transformers/__init__.py", ["BertConfig"]),
("src/transformers/utils/__init__.py", ["cached_file", "is_torch_available"]),
]
with patch_transformer_repo_path(tmp_folder):
assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports
def test_get_module_dependencies(self):
bert_module = os.path.join(transformers_path, "models", "bert", "modeling_bert.py")
expected_deps = [
"activations.py",
"modeling_outputs.py",
"modeling_utils.py",
"pytorch_utils.py",
"models/bert/configuration_bert.py",
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
create_tmp_repo(tmp_folder)
expected_bert_dependencies = [
"src/transformers/modeling_utils.py",
"src/transformers/models/bert/configuration_bert.py",
"src/transformers/utils/imports.py",
]
expected_deps = {os.path.join(transformers_path, f) for f in expected_deps}
repo = Repo(git_repo_path)
with checkout_commit(repo, GIT_TEST_SHA):
deps = get_module_dependencies(bert_module)
deps = {os.path.expanduser(f) for f in deps}
self.assertEqual(deps, expected_deps)
with patch_transformer_repo_path(tmp_folder):
assert get_module_dependencies(BERT_MODELING_FILE) == expected_bert_dependencies
expected_test_bert_dependencies = [
"tests/test_modeling_common.py",
"src/transformers/models/bert/configuration_bert.py",
"src/transformers/models/bert/modeling_bert.py",
]
with patch_transformer_repo_path(tmp_folder):
assert (
get_module_dependencies("tests/models/bert/test_modeling_bert.py")
== expected_test_bert_dependencies
)
# Test with a submodule
(tmp_folder / "src/transformers/utils/logging.py").touch()
with open(tmp_folder / BERT_MODELING_FILE, "a") as f:
f.write("from ...utils import logging\n")
expected_bert_dependencies = [
"src/transformers/modeling_utils.py",
"src/transformers/models/bert/configuration_bert.py",
"src/transformers/utils/logging.py",
"src/transformers/utils/imports.py",
]
with patch_transformer_repo_path(tmp_folder):
assert get_module_dependencies(BERT_MODELING_FILE) == expected_bert_dependencies
# Test with an object non-imported in the init
create_tmp_repo(tmp_folder)
with open(tmp_folder / BERT_MODELING_FILE, "a") as f:
f.write("from ...utils import CONSTANT\n")
expected_bert_dependencies = [
"src/transformers/modeling_utils.py",
"src/transformers/models/bert/configuration_bert.py",
"src/transformers/utils/__init__.py",
"src/transformers/utils/imports.py",
]
with patch_transformer_repo_path(tmp_folder):
assert get_module_dependencies(BERT_MODELING_FILE) == expected_bert_dependencies
def test_create_reverse_dependency_tree(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
create_tmp_repo(tmp_folder)
with patch_transformer_repo_path(tmp_folder):
tree = create_reverse_dependency_tree()
init_edges = [
"src/transformers/utils/hub.py",
"src/transformers/utils/imports.py",
"src/transformers/models/bert/configuration_bert.py",
"src/transformers/models/bert/modeling_bert.py",
]
assert {f for f, g in tree if g == "src/transformers/__init__.py"} == set(init_edges)
bert_edges = [
"src/transformers/modeling_utils.py",
"src/transformers/utils/imports.py",
"src/transformers/models/bert/configuration_bert.py",
]
assert {f for f, g in tree if g == "src/transformers/models/bert/modeling_bert.py"} == set(bert_edges)
test_bert_edges = [
"tests/test_modeling_common.py",
"src/transformers/models/bert/configuration_bert.py",
"src/transformers/models/bert/modeling_bert.py",
]
assert {f for f, g in tree if g == "tests/models/bert/test_modeling_bert.py"} == set(test_bert_edges)
def test_get_tree_starting_at(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
create_tmp_repo(tmp_folder)
with patch_transformer_repo_path(tmp_folder):
edges = create_reverse_dependency_tree()
bert_tree = get_tree_starting_at("src/transformers/models/bert/modeling_bert.py", edges)
config_utils_tree = get_tree_starting_at("src/transformers/configuration_utils.py", edges)
expected_bert_tree = [
"src/transformers/models/bert/modeling_bert.py",
[("src/transformers/models/bert/modeling_bert.py", "tests/models/bert/test_modeling_bert.py")],
]
assert bert_tree == expected_bert_tree
expected_config_tree = [
"src/transformers/configuration_utils.py",
[("src/transformers/configuration_utils.py", "src/transformers/models/bert/configuration_bert.py")],
[
("src/transformers/models/bert/configuration_bert.py", "tests/models/bert/test_modeling_bert.py"),
(
"src/transformers/models/bert/configuration_bert.py",
"src/transformers/models/bert/modeling_bert.py",
),
],
]
# Order of the edges is random
assert [set(v) for v in config_utils_tree] == [set(v) for v in expected_config_tree]
def test_print_tree_deps_of(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
create_tmp_repo(tmp_folder)
# There are two possible outputs since the order of the last two lines is non-deterministic.
expected_std_out = """src/transformers/models/bert/modeling_bert.py
tests/models/bert/test_modeling_bert.py
src/transformers/configuration_utils.py
src/transformers/models/bert/configuration_bert.py
src/transformers/models/bert/modeling_bert.py
tests/models/bert/test_modeling_bert.py"""
expected_std_out_2 = """src/transformers/models/bert/modeling_bert.py
tests/models/bert/test_modeling_bert.py
src/transformers/configuration_utils.py
src/transformers/models/bert/configuration_bert.py
tests/models/bert/test_modeling_bert.py
src/transformers/models/bert/modeling_bert.py"""
with patch_transformer_repo_path(tmp_folder), CaptureStdout() as cs:
print_tree_deps_of("src/transformers/models/bert/modeling_bert.py")
print_tree_deps_of("src/transformers/configuration_utils.py")
assert cs.out.strip() in [expected_std_out, expected_std_out_2]
def test_create_reverse_dependency_map(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
create_tmp_repo(tmp_folder)
with patch_transformer_repo_path(tmp_folder):
reverse_map = create_reverse_dependency_map()
# impact of BERT modeling file (note that we stop at the inits and don't go down further)
expected_bert_deps = {
"src/transformers/__init__.py",
"src/transformers/models/bert/__init__.py",
"tests/models/bert/test_modeling_bert.py",
}
assert set(reverse_map["src/transformers/models/bert/modeling_bert.py"]) == expected_bert_deps
# init gets the direct deps (and their recursive deps)
expected_init_deps = {
"src/transformers/utils/__init__.py",
"src/transformers/utils/hub.py",
"src/transformers/utils/imports.py",
"src/transformers/models/bert/__init__.py",
"src/transformers/models/bert/configuration_bert.py",
"src/transformers/models/bert/modeling_bert.py",
"src/transformers/configuration_utils.py",
"src/transformers/modeling_utils.py",
"tests/test_modeling_common.py",
"tests/models/bert/test_modeling_bert.py",
}
assert set(reverse_map["src/transformers/__init__.py"]) == expected_init_deps
expected_init_deps = {
"src/transformers/__init__.py",
"src/transformers/models/bert/configuration_bert.py",
"src/transformers/models/bert/modeling_bert.py",
"tests/models/bert/test_modeling_bert.py",
}
assert set(reverse_map["src/transformers/models/bert/__init__.py"]) == expected_init_deps
# Test that with more models init of bert only gets deps to bert.
create_tmp_repo(tmp_folder, models=["bert", "gpt2"])
with patch_transformer_repo_path(tmp_folder):
reverse_map = create_reverse_dependency_map()
# init gets the direct deps (and their recursive deps)
expected_init_deps = {
"src/transformers/__init__.py",
"src/transformers/models/bert/configuration_bert.py",
"src/transformers/models/bert/modeling_bert.py",
"tests/models/bert/test_modeling_bert.py",
}
assert set(reverse_map["src/transformers/models/bert/__init__.py"]) == expected_init_deps
def test_create_module_to_test_map(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
models = models = ["bert", "gpt2"] + [f"bert{i}" for i in range(10)]
create_tmp_repo(tmp_folder, models=models)
with patch_transformer_repo_path(tmp_folder):
test_map = create_module_to_test_map(filter_models=True)
for model in models:
assert test_map[f"src/transformers/models/{model}/modeling_{model}.py"] == [
f"tests/models/{model}/test_modeling_{model}.py"
]
# Init got filtered
expected_init_tests = {
"tests/test_modeling_common.py",
"tests/models/bert/test_modeling_bert.py",
"tests/models/gpt2/test_modeling_gpt2.py",
}
assert set(test_map["src/transformers/__init__.py"]) == expected_init_tests
def test_infer_tests_to_run(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
models = ["bert", "gpt2"] + [f"bert{i}" for i in range(10)]
repo = create_tmp_repo(tmp_folder, models=models)
commit_changes("src/transformers/models/bert/modeling_bert.py", BERT_MODEL_FILE_NEW_CODE, repo)
with patch_transformer_repo_path(tmp_folder):
infer_tests_to_run(tmp_folder / "test-output.txt", diff_with_last_commit=True)
with open(tmp_folder / "test-output.txt", "r") as f:
tests_to_run = f.read()
assert tests_to_run == "tests/models/bert/test_modeling_bert.py"
# Fake a new model addition
repo = create_tmp_repo(tmp_folder, models=models)
branch = repo.create_head("new_model")
branch.checkout()
with open(tmp_folder / "src/transformers/__init__.py", "a") as f:
f.write("from .models.t5 import T5Config, T5Model\n")
model_dir = tmp_folder / "src/transformers/models/t5"
model_dir.mkdir(exist_ok=True)
with open(model_dir / "__init__.py", "w") as f:
f.write("from .configuration_t5 import T5Config\nfrom .modeling_t5 import T5Model\n")
with open(model_dir / "configuration_t5.py", "w") as f:
f.write("from ...configuration_utils import PretrainedConfig\ncode")
with open(model_dir / "modeling_t5.py", "w") as f:
modeling_code = BERT_MODEL_FILE.replace("bert", "t5").replace("Bert", "T5")
f.write(modeling_code)
test_dir = tmp_folder / "tests/models/t5"
test_dir.mkdir(exist_ok=True)
(test_dir / "__init__.py").touch()
with open(test_dir / "test_modeling_t5.py", "w") as f:
f.write(
"from transformers import T5Config, T5Model\nfrom ...test_modeling_common import ModelTesterMixin\n\ncode"
)
repo.index.add(["src", "tests"])
repo.index.commit("Add T5 model")
with patch_transformer_repo_path(tmp_folder):
infer_tests_to_run(tmp_folder / "test-output.txt")
with open(tmp_folder / "test-output.txt", "r") as f:
tests_to_run = f.read()
expected_tests = {
"tests/models/bert/test_modeling_bert.py",
"tests/models/gpt2/test_modeling_gpt2.py",
"tests/models/t5/test_modeling_t5.py",
"tests/test_modeling_common.py",
}
assert set(tests_to_run.split(" ")) == expected_tests
with patch_transformer_repo_path(tmp_folder):
infer_tests_to_run(tmp_folder / "test-output.txt", filter_models=False)
with open(tmp_folder / "test-output.txt", "r") as f:
tests_to_run = f.read()
expected_tests = [f"tests/models/{name}/test_modeling_{name}.py" for name in models + ["t5"]]
expected_tests = set(expected_tests + ["tests/test_modeling_common.py"])
assert set(tests_to_run.split(" ")) == expected_tests
def test_infer_tests_to_run_with_test_modifs(self):
with tempfile.TemporaryDirectory() as tmp_folder:
tmp_folder = Path(tmp_folder)
models = ["bert", "gpt2"] + [f"bert{i}" for i in range(10)]
repo = create_tmp_repo(tmp_folder, models=models)
commit_changes(
"tests/models/bert/test_modeling_bert.py",
"from transformers import BertConfig, BertModel\nfrom ...test_modeling_common import ModelTesterMixin\n\ncode1",
repo,
)
with patch_transformer_repo_path(tmp_folder):
infer_tests_to_run(tmp_folder / "test-output.txt", diff_with_last_commit=True)
with open(tmp_folder / "test-output.txt", "r") as f:
tests_to_run = f.read()
assert tests_to_run == "tests/models/bert/test_modeling_bert.py"
def test_parse_commit_message(self):
assert parse_commit_message("Normal commit") == {"skip": False, "no_filter": False, "test_all": False}
assert parse_commit_message("[skip ci] commit") == {"skip": True, "no_filter": False, "test_all": False}
assert parse_commit_message("[ci skip] commit") == {"skip": True, "no_filter": False, "test_all": False}
assert parse_commit_message("[skip-ci] commit") == {"skip": True, "no_filter": False, "test_all": False}
assert parse_commit_message("[skip_ci] commit") == {"skip": True, "no_filter": False, "test_all": False}
assert parse_commit_message("[no filter] commit") == {"skip": False, "no_filter": True, "test_all": False}
assert parse_commit_message("[no-filter] commit") == {"skip": False, "no_filter": True, "test_all": False}
assert parse_commit_message("[no_filter] commit") == {"skip": False, "no_filter": True, "test_all": False}
assert parse_commit_message("[filter-no] commit") == {"skip": False, "no_filter": True, "test_all": False}
assert parse_commit_message("[test all] commit") == {"skip": False, "no_filter": False, "test_all": True}
assert parse_commit_message("[all test] commit") == {"skip": False, "no_filter": False, "test_all": True}
assert parse_commit_message("[test-all] commit") == {"skip": False, "no_filter": False, "test_all": True}
assert parse_commit_message("[all_test] commit") == {"skip": False, "no_filter": False, "test_all": True}

View File

@ -13,6 +13,27 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Welcome to tests_fetcher V2.
This util is designed to fetch tests to run on a PR so that only the tests impacted by the modifications are run, and
when too many models are being impacted, only run the tests of a subset of core models. It works like this.
Stage 1: Identify the modified files. This takes all the files from the branching point to the current commit (so
all modifications in a PR, not just the last commit) but excludes modifications that are on docstrings or comments
only.
Stage 2: Extract the tests to run. This is done by looking at the imports in each module and test file: if module A
imports module B, then changing module B impacts module A, so the tests using module A should be run. We thus get the
dependencies of each model and then recursively builds the 'reverse' map of dependencies to get all modules and tests
impacted by a given file. We then only keep the tests (and only the code models tests if there are too many modules).
Caveats:
- This module only filters tests by files (not individual tests) so it's better to have tests for different things
in different files.
- This module assumes inits are just importing things, not really building objects, so it's better to structure
them this way and move objects building in separate submodules.
"""
import argparse
import collections
import json
@ -24,13 +45,36 @@ from pathlib import Path
from git import Repo
# This script is intended to be run from the root of the repo but you can adapt this constant if you need to.
PATH_TO_TRANFORMERS = "."
PATH_TO_REPO = Path(__file__).parent.parent.resolve()
PATH_TO_TRANFORMERS = PATH_TO_REPO / "src/transformers"
PATH_TO_TESTS = PATH_TO_REPO / "tests"
# A temporary way to trigger all pipeline tests contained in model test files after PR #21516
all_model_test_files = [str(x) for x in Path("tests/models/").glob("**/**/test_modeling_*.py")]
all_pipeline_test_files = [str(x) for x in Path("tests/pipelines/").glob("**/test_pipelines_*.py")]
# List here the models to always test.
IMPORTANT_MODELS = [
# Most downloaded models
"bert",
"clip",
"t5",
"xlm-roberta",
"gpt2",
"bart",
"mpnet",
"gpt-j",
"wav2vec2",
"deberta-v2",
"layoutlm",
"opt",
"longformer",
"vit",
# Pipeline-specific model (to be sure each pipeline has one model in this list)
"tapas",
"vilt",
"clap",
"detr",
"owlvit",
"dpt",
"videomae",
]
@contextmanager
@ -79,17 +123,21 @@ def get_all_tests():
- folders under `tests/models`: `bert`, `gpt2`, etc.
- test files under `tests`: `test_modeling_common.py`, `test_tokenization_common.py`, etc.
"""
test_root_dir = os.path.join(PATH_TO_TRANFORMERS, "tests")
# test folders/files directly under `tests` folder
tests = os.listdir(test_root_dir)
tests = sorted(filter(lambda x: os.path.isdir(x) or x.startswith("tests/test_"), [f"tests/{x}" for x in tests]))
tests = os.listdir(PATH_TO_TESTS)
tests = [f"tests/{f}" for f in tests if "__pycache__" not in f]
tests = sorted([f for f in tests if (PATH_TO_REPO / f).is_dir() or f.startswith("tests/test_")])
# model specific test folders
model_tests_folders = os.listdir(os.path.join(test_root_dir, "models"))
model_test_folders = sorted(filter(os.path.isdir, [f"tests/models/{x}" for x in model_tests_folders]))
model_test_folders = os.listdir(PATH_TO_TESTS / "models")
model_test_folders = [f"tests/models/{f}" for f in model_test_folders if "__pycache__" not in f]
model_test_folders = sorted([f for f in model_test_folders if (PATH_TO_REPO / f).is_dir()])
tests.remove("tests/models")
# Sagemaker tests are not meant to be run on the CI.
if "tests/sagemaker" in tests:
tests.remove("tests/sagemaker")
tests = model_test_folders + tests
return tests
@ -99,11 +147,12 @@ def diff_is_docstring_only(repo, branching_point, filename):
"""
Check if the diff is only in docstrings in a filename.
"""
folder = Path(repo.working_dir)
with checkout_commit(repo, branching_point):
with open(filename, "r", encoding="utf-8") as f:
with open(folder / filename, "r", encoding="utf-8") as f:
old_content = f.read()
with open(filename, "r", encoding="utf-8") as f:
with open(folder / filename, "r", encoding="utf-8") as f:
new_content = f.read()
old_content_clean = clean_code(old_content)
@ -112,31 +161,6 @@ def diff_is_docstring_only(repo, branching_point, filename):
return old_content_clean == new_content_clean
def get_modified_python_files(diff_with_last_commit=False):
"""
Return a list of python files that have been modified between:
- the current head and the main branch if `diff_with_last_commit=False` (default)
- the current head and its parent commit otherwise.
"""
repo = Repo(PATH_TO_TRANFORMERS)
if not diff_with_last_commit:
print(f"main is at {repo.refs.main.commit}")
print(f"Current head is at {repo.head.commit}")
branching_commits = repo.merge_base(repo.refs.main, repo.head)
for commit in branching_commits:
print(f"Branching commit: {commit}")
return get_diff(repo, repo.head.commit, branching_commits)
else:
print(f"main is at {repo.head.commit}")
parent_commits = repo.head.commit.parents
for commit in parent_commits:
print(f"Parent commit: {commit}")
return get_diff(repo, repo.head.commit, parent_commits)
def get_diff(repo, base_commit, commits):
"""
Get's the diff between one or several commits and the head of the repository.
@ -166,96 +190,173 @@ def get_diff(repo, base_commit, commits):
return code_diff
def get_module_dependencies(module_fname):
def get_modified_python_files(diff_with_last_commit=False):
"""
Get the dependencies of a module.
Return a list of python files that have been modified between:
- the current head and the main branch if `diff_with_last_commit=False` (default)
- the current head and its parent commit otherwise.
"""
with open(os.path.join(PATH_TO_TRANFORMERS, module_fname), "r", encoding="utf-8") as f:
repo = Repo(PATH_TO_REPO)
if not diff_with_last_commit:
print(f"main is at {repo.refs.main.commit}")
print(f"Current head is at {repo.head.commit}")
branching_commits = repo.merge_base(repo.refs.main, repo.head)
for commit in branching_commits:
print(f"Branching commit: {commit}")
return get_diff(repo, repo.head.commit, branching_commits)
else:
print(f"main is at {repo.head.commit}")
parent_commits = repo.head.commit.parents
for commit in parent_commits:
print(f"Parent commit: {commit}")
return get_diff(repo, repo.head.commit, parent_commits)
# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line.
# \s*from\s+(\.+\S+)\s+import\s+([^\n]+) -> Line only contains from .xxx import yyy and we catch .xxx and yyy
# (?=\n) -> Look-ahead to a new line. We can't just put \n here or using find_all on this re will only catch every
# other import.
_re_single_line_relative_imports = re.compile(r"(?:^|\n)\s*from\s+(\.+\S+)\s+import\s+([^\n]+)(?=\n)")
# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line.
# \s*from\s+(\.+\S+)\s+import\s+\(([^\)]+)\) -> Line continues with from .xxx import (yyy) and we catch .xxx and yyy
# yyy will take multiple lines otherwise there wouldn't be parenthesis.
_re_multi_line_relative_imports = re.compile(r"(?:^|\n)\s*from\s+(\.+\S+)\s+import\s+\(([^\)]+)\)")
# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line.
# \s*from\s+transformers(\S*)\s+import\s+([^\n]+) -> Line only contains from transformers.xxx import yyy and we catch
# .xxx and yyy
# (?=\n) -> Look-ahead to a new line. We can't just put \n here or using find_all on this re will only catch every
# other import.
_re_single_line_direct_imports = re.compile(r"(?:^|\n)\s*from\s+transformers(\S*)\s+import\s+([^\n]+)(?=\n)")
# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line.
# \s*from\s+transformers(\S*)\s+import\s+\(([^\)]+)\) -> Line continues with from transformers.xxx import (yyy) and we
# catch .xxx and yyy. yyy will take multiple lines otherwise there wouldn't be parenthesis.
_re_multi_line_direct_imports = re.compile(r"(?:^|\n)\s*from\s+transformers(\S*)\s+import\s+\(([^\)]+)\)")
def extract_imports(module_fname, cache=None):
"""
Get the imports a given module makes. This takes a module filename and returns the list of module filenames
imported in the module with the objects imported in that module filename.
"""
if cache is not None and module_fname in cache:
return cache[module_fname]
with open(PATH_TO_REPO / module_fname, "r", encoding="utf-8") as f:
content = f.read()
module_parts = module_fname.split(os.path.sep)
# Filter out all docstrings to not get imports in code examples.
splits = content.split('"""')
content = "".join(splits[::2])
module_parts = str(module_fname).split(os.path.sep)
imported_modules = []
# Let's start with relative imports
relative_imports = re.findall(r"from\s+(\.+\S+)\s+import\s+([^\n]+)\n", content)
relative_imports = [mod for mod, imp in relative_imports if "# tests_ignore" not in imp]
for imp in relative_imports:
relative_imports = _re_single_line_relative_imports.findall(content)
relative_imports = [
(mod, imp) for mod, imp in relative_imports if "# tests_ignore" not in imp and imp.strip() != "("
]
multiline_relative_imports = _re_multi_line_relative_imports.findall(content)
relative_imports += [(mod, imp) for mod, imp in multiline_relative_imports if "# tests_ignore" not in imp]
for module, imports in relative_imports:
level = 0
while imp.startswith("."):
imp = imp[1:]
while module.startswith("."):
module = module[1:]
level += 1
if len(imp) > 0:
dep_parts = module_parts[: len(module_parts) - level] + imp.split(".")
if len(module) > 0:
dep_parts = module_parts[: len(module_parts) - level] + module.split(".")
else:
dep_parts = module_parts[: len(module_parts) - level] + ["__init__.py"]
dep_parts = module_parts[: len(module_parts) - level]
imported_module = os.path.sep.join(dep_parts)
# We ignore the main init import as it's only for the __version__ that it's done
# and it would add everything as a dependency.
if not imported_module.endswith("transformers/__init__.py"):
imported_modules.append(imported_module)
imported_modules.append((imported_module, [imp.strip() for imp in imports.split(",")]))
# Let's continue with direct imports
# The import from the transformers module are ignored for the same reason we ignored the
# main init before.
direct_imports = re.findall(r"from\s+transformers\.(\S+)\s+import\s+([^\n]+)\n", content)
direct_imports = [mod for mod, imp in direct_imports if "# tests_ignore" not in imp]
for imp in direct_imports:
import_parts = imp.split(".")
direct_imports = _re_single_line_direct_imports.findall(content)
direct_imports = [(mod, imp) for mod, imp in direct_imports if "# tests_ignore" not in imp and imp.strip() != "("]
multiline_direct_imports = _re_multi_line_direct_imports.findall(content)
direct_imports += [(mod, imp) for mod, imp in multiline_direct_imports if "# tests_ignore" not in imp]
for module, imports in direct_imports:
import_parts = module.split(".")[1:] # ignore the first .
dep_parts = ["src", "transformers"] + import_parts
imported_modules.append(os.path.sep.join(dep_parts))
imported_module = os.path.sep.join(dep_parts)
imported_modules.append((imported_module, [imp.strip() for imp in imports.split(",")]))
# Now let's just check that we have proper module files, or append an init for submodules
result = []
for module_file, imports in imported_modules:
if (PATH_TO_REPO / f"{module_file}.py").is_file():
module_file = f"{module_file}.py"
elif (PATH_TO_REPO / module_file).is_dir() and (PATH_TO_REPO / module_file / "__init__.py").is_file():
module_file = os.path.sep.join([module_file, "__init__.py"])
imports = [imp for imp in imports if len(imp) > 0 and re.match("^[A-Za-z0-9_]*$", imp)]
if len(imports) > 0:
result.append((module_file, imports))
if cache is not None:
cache[module_fname] = result
return result
def get_module_dependencies(module_fname, cache=None):
"""
Get the dependencies of a module from the module filename as a list of module filenames. This will resolve any
__init__ we pass: if we import from a submodule utils, the dependencies will be utils/foo.py and utils/bar.py (if
the objects imported actually come from utils.foo and utils.bar) not utils/__init__.py.
"""
dependencies = []
for imported_module in imported_modules:
if os.path.isfile(os.path.join(PATH_TO_TRANFORMERS, f"{imported_module}.py")):
dependencies.append(f"{imported_module}.py")
elif os.path.isdir(os.path.join(PATH_TO_TRANFORMERS, imported_module)) and os.path.isfile(
os.path.sep.join([PATH_TO_TRANFORMERS, imported_module, "__init__.py"])
):
dependencies.append(os.path.sep.join([imported_module, "__init__.py"]))
imported_modules = extract_imports(module_fname, cache=cache)
# The while loop is to recursively traverse all inits we may encounter.
while len(imported_modules) > 0:
new_modules = []
for module, imports in imported_modules:
# If we end up in an __init__ we are often not actually importing from this init (except in the case where
# the object is fully defined in the __init__)
if module.endswith("__init__.py"):
# So we get the imports from that init then try to find where our objects come from.
new_imported_modules = extract_imports(module, cache=cache)
for new_module, new_imports in new_imported_modules:
if any([i in new_imports for i in imports]):
if new_module not in dependencies:
new_modules.append((new_module, [i for i in new_imports if i in imports]))
imports = [i for i in imports if i not in new_imports]
if len(imports) > 0:
# If there are any objects lefts, they may be a submodule
path_to_module = PATH_TO_REPO / module.replace("__init__.py", "")
dependencies.extend(
[
os.path.join(module.replace("__init__.py", ""), f"{i}.py")
for i in imports
if (path_to_module / f"{i}.py").is_file()
]
)
imports = [i for i in imports if not (path_to_module / f"{i}.py").is_file()]
if len(imports) > 0:
# Then if there are still objects left, they are fully defined in the init, so we keep it as a
# dependency.
dependencies.append(module)
else:
dependencies.append(module)
imported_modules = new_modules
return dependencies
def get_test_dependencies(test_fname):
"""
Get the dependencies of a test file.
"""
with open(os.path.join(PATH_TO_TRANFORMERS, test_fname), "r", encoding="utf-8") as f:
content = f.read()
# Tests only have relative imports for other test files
# TODO Sylvain: handle relative imports cleanly
relative_imports = re.findall(r"from\s+(\.\S+)\s+import\s+([^\n]+)\n", content)
relative_imports = [test for test, imp in relative_imports if "# tests_ignore" not in imp]
def _convert_relative_import_to_file(relative_import):
level = 0
while relative_import.startswith("."):
level += 1
relative_import = relative_import[1:]
directory = os.path.sep.join(test_fname.split(os.path.sep)[:-level])
return os.path.join(directory, f"{relative_import.replace('.', os.path.sep)}.py")
dependencies = [_convert_relative_import_to_file(relative_import) for relative_import in relative_imports]
return [f for f in dependencies if os.path.isfile(os.path.join(PATH_TO_TRANFORMERS, f))]
def create_reverse_dependency_tree():
"""
Create a list of all edges (a, b) which mean that modifying a impacts b with a going over all module and test files.
"""
modules = [
str(f.relative_to(PATH_TO_TRANFORMERS))
for f in (Path(PATH_TO_TRANFORMERS) / "src/transformers").glob("**/*.py")
]
module_edges = [(d, m) for m in modules for d in get_module_dependencies(m)]
cache = {}
all_modules = list(PATH_TO_TRANFORMERS.glob("**/*.py")) + list(PATH_TO_TESTS.glob("**/*.py"))
all_modules = [str(mod.relative_to(PATH_TO_REPO)) for mod in all_modules]
edges = [(dep, mod) for mod in all_modules for dep in get_module_dependencies(mod, cache=cache)]
tests = [str(f.relative_to(PATH_TO_TRANFORMERS)) for f in (Path(PATH_TO_TRANFORMERS) / "tests").glob("**/*.py")]
test_edges = [(d, t) for t in tests for d in get_test_dependencies(t)]
return module_edges + test_edges
return list(set(edges))
def get_tree_starting_at(module, edges):
@ -264,13 +365,17 @@ def get_tree_starting_at(module, edges):
starting at module], [list of edges starting at the preceding level], ...]
"""
vertices_seen = [module]
new_edges = [edge for edge in edges if edge[0] == module and edge[1] != module]
new_edges = [edge for edge in edges if edge[0] == module and edge[1] != module and "__init__.py" not in edge[1]]
tree = [module]
while len(new_edges) > 0:
tree.append(new_edges)
final_vertices = list({edge[1] for edge in new_edges})
vertices_seen.extend(final_vertices)
new_edges = [edge for edge in edges if edge[0] in final_vertices and edge[1] not in vertices_seen]
new_edges = [
edge
for edge in edges
if edge[0] in final_vertices and edge[1] not in vertices_seen and "__init__.py" not in edge[1]
]
return tree
@ -308,290 +413,159 @@ def create_reverse_dependency_map():
Create the dependency map from module/test filename to the list of modules/tests that depend on it (even
recursively).
"""
modules = [
str(f.relative_to(PATH_TO_TRANFORMERS))
for f in (Path(PATH_TO_TRANFORMERS) / "src/transformers").glob("**/*.py")
]
# We grab all the dependencies of each module.
direct_deps = {m: get_module_dependencies(m) for m in modules}
# We add all the dependencies of each test file
tests = [str(f.relative_to(PATH_TO_TRANFORMERS)) for f in (Path(PATH_TO_TRANFORMERS) / "tests").glob("**/*.py")]
direct_deps.update({t: get_test_dependencies(t) for t in tests})
all_files = modules + tests
cache = {}
all_modules = list(PATH_TO_TRANFORMERS.glob("**/*.py")) + list(PATH_TO_TESTS.glob("**/*.py"))
all_modules = [str(mod.relative_to(PATH_TO_REPO)) for mod in all_modules]
direct_deps = {m: get_module_dependencies(m, cache=cache) for m in all_modules}
# This recurses the dependencies
something_changed = True
while something_changed:
something_changed = False
for m in all_files:
for m in all_modules:
for d in direct_deps[m]:
if d.endswith("__init__.py"):
continue
if d not in direct_deps:
raise ValueError(f"KeyError:{d}. From {m}")
for dep in direct_deps[d]:
if dep not in direct_deps[m]:
direct_deps[m].append(dep)
new_deps = set(direct_deps[d]) - set(direct_deps[m])
if len(new_deps) > 0:
direct_deps[m].extend(list(new_deps))
something_changed = True
# Finally we can build the reverse map.
reverse_map = collections.defaultdict(list)
for m in all_files:
if m.endswith("__init__.py"):
reverse_map[m].extend(direct_deps[m])
for m in all_modules:
for d in direct_deps[m]:
reverse_map[d].append(m)
for m in [f for f in all_modules if f.endswith("__init__.py")]:
direct_deps = get_module_dependencies(m, cache=cache)
deps = sum([reverse_map[d] for d in direct_deps if not d.endswith("__init__.py")], direct_deps)
reverse_map[m] = list(set(deps) - {m})
return reverse_map
# Any module file that has a test name which can't be inferred automatically from its name should go here. A better
# approach is to (re-)name the test file accordingly, and second best to add the correspondence map here.
SPECIAL_MODULE_TO_TEST_MAP = {
"commands/add_new_model_like.py": "utils/test_add_new_model_like.py",
"configuration_utils.py": "test_configuration_common.py",
"convert_graph_to_onnx.py": "onnx/test_onnx.py",
"data/data_collator.py": "trainer/test_data_collator.py",
"deepspeed.py": "deepspeed/",
"feature_extraction_sequence_utils.py": "test_sequence_feature_extraction_common.py",
"feature_extraction_utils.py": "test_feature_extraction_common.py",
"file_utils.py": ["utils/test_file_utils.py", "utils/test_model_output.py"],
"image_processing_utils.py": ["test_image_processing_common.py", "utils/test_image_processing_utils.py"],
"image_transforms.py": "test_image_transforms.py",
"utils/generic.py": ["utils/test_file_utils.py", "utils/test_model_output.py", "utils/test_generic.py"],
"utils/hub.py": "utils/test_hub_utils.py",
"modelcard.py": "utils/test_model_card.py",
"modeling_flax_utils.py": "test_modeling_flax_common.py",
"modeling_tf_utils.py": ["test_modeling_tf_common.py", "utils/test_modeling_tf_core.py"],
"modeling_utils.py": ["test_modeling_common.py", "utils/test_offline.py"],
"models/auto/modeling_auto.py": [
"models/auto/test_modeling_auto.py",
"models/auto/test_modeling_tf_pytorch.py",
"models/bort/test_modeling_bort.py",
"models/dit/test_modeling_dit.py",
],
"models/auto/modeling_flax_auto.py": "models/auto/test_modeling_flax_auto.py",
"models/auto/modeling_tf_auto.py": [
"models/auto/test_modeling_tf_auto.py",
"models/auto/test_modeling_tf_pytorch.py",
"models/bort/test_modeling_tf_bort.py",
],
"models/gpt2/modeling_gpt2.py": [
"models/gpt2/test_modeling_gpt2.py",
"models/megatron_gpt2/test_modeling_megatron_gpt2.py",
],
"models/dpt/modeling_dpt.py": [
"models/dpt/test_modeling_dpt.py",
"models/dpt/test_modeling_dpt_hybrid.py",
],
"optimization.py": "optimization/test_optimization.py",
"optimization_tf.py": "optimization/test_optimization_tf.py",
"pipelines/__init__.py": all_pipeline_test_files + all_model_test_files,
"pipelines/base.py": all_pipeline_test_files + all_model_test_files,
"pipelines/text2text_generation.py": [
"pipelines/test_pipelines_text2text_generation.py",
"pipelines/test_pipelines_summarization.py",
"pipelines/test_pipelines_translation.py",
],
"pipelines/zero_shot_classification.py": "pipelines/test_pipelines_zero_shot.py",
"testing_utils.py": "utils/test_skip_decorators.py",
"tokenization_utils.py": ["test_tokenization_common.py", "tokenization/test_tokenization_utils.py"],
"tokenization_utils_base.py": ["test_tokenization_common.py", "tokenization/test_tokenization_utils.py"],
"tokenization_utils_fast.py": [
"test_tokenization_common.py",
"tokenization/test_tokenization_utils.py",
"tokenization/test_tokenization_fast.py",
],
"trainer.py": [
"trainer/test_trainer.py",
"extended/test_trainer_ext.py",
"trainer/test_trainer_distributed.py",
"trainer/test_trainer_tpu.py",
],
"train_pt_utils.py": "trainer/test_trainer_utils.py",
"utils/versions.py": "utils/test_versions_utils.py",
}
def module_to_test_file(module_fname):
def create_module_to_test_map(reverse_map=None, filter_models=False):
"""
Returns the name of the file(s) where `module_fname` is tested.
Extract the tests from the reverse_dependency_map and potentially filters the model tests.
"""
splits = module_fname.split(os.path.sep)
if reverse_map is None:
reverse_map = create_reverse_dependency_map()
test_map = {module: [f for f in deps if f.startswith("tests")] for module, deps in reverse_map.items()}
# Special map has priority
short_name = os.path.sep.join(splits[2:])
if short_name in SPECIAL_MODULE_TO_TEST_MAP:
test_file = SPECIAL_MODULE_TO_TEST_MAP[short_name]
if isinstance(test_file, str):
return f"tests/{test_file}"
return [f"tests/{f}" for f in test_file]
if not filter_models:
return test_map
module_name = splits[-1]
# Fast tokenizers are tested in the same file as the slow ones.
if module_name.endswith("_fast.py"):
module_name = module_name.replace("_fast.py", ".py")
num_model_tests = len(list(PATH_TO_TESTS.glob("models/*")))
# Special case for pipelines submodules
if len(splits) >= 2 and splits[-2] == "pipelines":
default_test_file = f"tests/pipelines/test_pipelines_{module_name}"
return [default_test_file] + all_model_test_files
# Special case for benchmarks submodules
elif len(splits) >= 2 and splits[-2] == "benchmark":
return ["tests/benchmark/test_benchmark.py", "tests/benchmark/test_benchmark_tf.py"]
# Special case for commands submodules
elif len(splits) >= 2 and splits[-2] == "commands":
return "tests/utils/test_cli.py"
# Special case for onnx submodules
elif len(splits) >= 2 and splits[-2] == "onnx":
return ["tests/onnx/test_features.py", "tests/onnx/test_onnx.py", "tests/onnx/test_onnx_v2.py"]
# Special case for utils (not the one in src/transformers, the ones at the root of the repo).
elif len(splits) > 0 and splits[0] == "utils":
default_test_file = f"tests/repo_utils/test_{module_name}"
elif len(splits) > 4 and splits[2] == "models":
default_test_file = f"tests/models/{splits[3]}/test_{module_name}"
elif len(splits) > 2 and splits[2].startswith("generation"):
default_test_file = f"tests/generation/test_{module_name}"
elif len(splits) > 2 and splits[2].startswith("trainer"):
default_test_file = f"tests/trainer/test_{module_name}"
else:
default_test_file = f"tests/utils/test_{module_name}"
def has_many_models(tests):
model_tests = {Path(t).parts[2] for t in tests if t.startswith("tests/models/")}
return len(model_tests) > num_model_tests // 2
if os.path.isfile(default_test_file):
return default_test_file
def filter_tests(tests):
return [t for t in tests if not t.startswith("tests/models/") or Path(t).parts[2] in IMPORTANT_MODELS]
# Processing -> processor
if "processing" in default_test_file:
test_file = default_test_file.replace("processing", "processor")
if os.path.isfile(test_file):
return test_file
return {module: (filter_tests(tests) if has_many_models(tests) else tests) for module, tests in test_map.items()}
# This list contains the list of test files we expect never to be launched from a change in a module/util. Those are
# launched separately.
EXPECTED_TEST_FILES_NEVER_TOUCHED = [
"tests/generation/test_framework_agnostic.py", # Mixins inherited by actual test classes
"tests/mixed_int8/test_mixed_int8.py", # Mixed-int8 bitsandbytes test
"tests/pipelines/test_pipelines_common.py", # Actually checked by the pipeline based file
"tests/sagemaker/test_single_node_gpu.py", # SageMaker test
"tests/sagemaker/test_multi_node_model_parallel.py", # SageMaker test
"tests/sagemaker/test_multi_node_data_parallel.py", # SageMaker test
"tests/test_pipeline_mixin.py", # Contains no test of its own (only the common tester class)
"tests/utils/test_doc_samples.py", # Doc tests
]
def check_imports_all_exist():
"""
Isn't used per se by the test fetcher but might be used later as a quality check. Putting this here for now so the
code is not lost.
"""
cache = {}
all_modules = list(PATH_TO_TRANFORMERS.glob("**/*.py")) + list(PATH_TO_TESTS.glob("**/*.py"))
all_modules = [str(mod.relative_to(PATH_TO_REPO)) for mod in all_modules]
direct_deps = {m: get_module_dependencies(m, cache=cache) for m in all_modules}
for module, deps in direct_deps.items():
for dep in deps:
if not (PATH_TO_REPO / dep).is_file():
print(f"{module} has dependency on {dep} which does not exist.")
def _print_list(l):
return "\n".join([f"- {f}" for f in l])
def sanity_check():
"""
Checks that all test files can be touched by a modification in at least one module/utils. This test ensures that
newly-added test files are properly mapped to some module or utils, so they can be run by the CI.
"""
# Grab all module and utils
all_files = [
str(p.relative_to(PATH_TO_TRANFORMERS))
for p in (Path(PATH_TO_TRANFORMERS) / "src/transformers").glob("**/*.py")
]
all_files += [
str(p.relative_to(PATH_TO_TRANFORMERS)) for p in (Path(PATH_TO_TRANFORMERS) / "utils").glob("**/*.py")
]
def create_json_map(test_files_to_run, json_output_file):
if json_output_file is None:
return
# Compute all the test files we get from those.
test_files_found = []
for f in all_files:
test_f = module_to_test_file(f)
if test_f is not None:
if isinstance(test_f, str):
test_files_found.append(test_f)
test_map = {}
for test_file in test_files_to_run:
# `test_file` is a path to a test folder/file, starting with `tests/`. For example,
# - `tests/models/bert/test_modeling_bert.py` or `tests/models/bert`
# - `tests/trainer/test_trainer.py` or `tests/trainer`
# - `tests/test_modeling_common.py`
names = test_file.split(os.path.sep)
if names[1] == "models":
# take the part like `models/bert` for modeling tests
key = os.path.sep.join(names[1:3])
elif len(names) > 2 or not test_file.endswith(".py"):
# test folders under `tests` or python files under them
# take the part like tokenization, `pipeline`, etc. for other test categories
key = os.path.sep.join(names[1:2])
else:
test_files_found.extend(test_f)
# common test files directly under `tests/`
key = "common"
# Some of the test files might actually be subfolders so we grab the tests inside.
test_files = []
for test_f in test_files_found:
if os.path.isdir(os.path.join(PATH_TO_TRANFORMERS, test_f)):
test_files.extend(
[
str(p.relative_to(PATH_TO_TRANFORMERS))
for p in (Path(PATH_TO_TRANFORMERS) / test_f).glob("**/test*.py")
]
)
else:
test_files.append(test_f)
if key not in test_map:
test_map[key] = []
test_map[key].append(test_file)
# Compare to existing test files
existing_test_files = [
str(p.relative_to(PATH_TO_TRANFORMERS)) for p in (Path(PATH_TO_TRANFORMERS) / "tests").glob("**/test*.py")
]
not_touched_test_files = [f for f in existing_test_files if f not in test_files]
should_be_tested = set(not_touched_test_files) - set(EXPECTED_TEST_FILES_NEVER_TOUCHED)
if len(should_be_tested) > 0:
raise ValueError(
"The following test files are not currently associated with any module or utils files, which means they "
f"will never get run by the CI:\n{_print_list(should_be_tested)}\n. Make sure the names of these test "
"files match the name of the module or utils they are testing, or adapt the constant "
"`SPECIAL_MODULE_TO_TEST_MAP` in `utils/tests_fetcher.py` to add them. If your test file is triggered "
"separately and is not supposed to be run by the regular CI, add it to the "
"`EXPECTED_TEST_FILES_NEVER_TOUCHED` constant instead."
)
# sort the keys & values
keys = sorted(test_map.keys())
test_map = {k: " ".join(sorted(test_map[k])) for k in keys}
with open(json_output_file, "w", encoding="UTF-8") as fp:
json.dump(test_map, fp, ensure_ascii=False)
def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None, json_output_file=None):
def infer_tests_to_run(
output_file, diff_with_last_commit=False, filters=None, filter_models=True, json_output_file=None
):
modified_files = get_modified_python_files(diff_with_last_commit=diff_with_last_commit)
print(f"\n### MODIFIED FILES ###\n{_print_list(modified_files)}")
# Create the map that will give us all impacted modules.
impacted_modules_map = create_reverse_dependency_map()
reverse_map = create_reverse_dependency_map()
impacted_files = modified_files.copy()
for f in modified_files:
if f in impacted_modules_map:
impacted_files.extend(impacted_modules_map[f])
if f in reverse_map:
impacted_files.extend(reverse_map[f])
# Remove duplicates
impacted_files = sorted(set(impacted_files))
print(f"\n### IMPACTED FILES ###\n{_print_list(impacted_files)}")
# Grab the corresponding test files:
if "setup.py" in impacted_files:
if "setup.py" in modified_files:
test_files_to_run = ["tests"]
repo_utils_launch = True
else:
# Grab the corresponding test files:
test_files_to_run = []
for f in impacted_files:
# Modified test files are always added
if f.startswith("tests/"):
test_files_to_run.append(f)
# Example files are tested separately
elif f.startswith("examples/pytorch"):
test_files_to_run.append("examples/pytorch/test_pytorch_examples.py")
test_files_to_run.append("examples/pytorch/test_accelerate_examples.py")
elif f.startswith("examples/tensorflow"):
test_files_to_run.append("examples/tensorflow/test_tensorflow_examples.py")
elif f.startswith("examples/flax"):
test_files_to_run.append("examples/flax/test_flax_examples.py")
else:
new_tests = module_to_test_file(f)
if new_tests is not None:
if isinstance(new_tests, str):
test_files_to_run.append(new_tests)
else:
test_files_to_run.extend(new_tests)
# Remove duplicates
# All modified tests need to be run.
test_files_to_run = [
f for f in modified_files if f.startswith("tests") and f.split(os.path.sep)[-1].startswith("test")
]
# Then we grab the corresponding test files.
test_map = create_module_to_test_map(reverse_map=reverse_map, filter_models=filter_models)
for f in modified_files:
if f in test_map:
test_files_to_run.extend(test_map[f])
test_files_to_run = sorted(set(test_files_to_run))
# Remove SageMaker tests
test_files_to_run = [f for f in test_files_to_run if not f.split(os.path.sep)[1] == "sagemaker"]
# Make sure we did not end up with a test file that was removed
test_files_to_run = [f for f in test_files_to_run if os.path.isfile(f) or os.path.isdir(f)]
test_files_to_run = [f for f in test_files_to_run if (PATH_TO_REPO / f).exists()]
if filters is not None:
filtered_files = []
for filter in filters:
filtered_files.extend([f for f in test_files_to_run if f.startswith(filter)])
for _filter in filters:
filtered_files.extend([f for f in test_files_to_run if f.startswith(_filter)])
test_files_to_run = filtered_files
repo_utils_launch = any(f.split(os.path.sep)[1] == "repo_utils" for f in test_files_to_run)
repo_utils_launch = any(f.split(os.path.sep)[1] == "repo_utils" for f in modified_files)
if repo_utils_launch:
repo_util_file = Path(output_file).parent / "test_repo_utils.txt"
@ -610,34 +584,7 @@ def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None, j
if "tests" in test_files_to_run:
test_files_to_run = get_all_tests()
if json_output_file is not None:
test_map = {}
for test_file in test_files_to_run:
# `test_file` is a path to a test folder/file, starting with `tests/`. For example,
# - `tests/models/bert/test_modeling_bert.py` or `tests/models/bert`
# - `tests/trainer/test_trainer.py` or `tests/trainer`
# - `tests/test_modeling_common.py`
names = test_file.split(os.path.sep)
if names[1] == "models":
# take the part like `models/bert` for modeling tests
key = "/".join(names[1:3])
elif len(names) > 2 or not test_file.endswith(".py"):
# test folders under `tests` or python files under them
# take the part like tokenization, `pipeline`, etc. for other test categories
key = "/".join(names[1:2])
else:
# common test files directly under `tests/`
key = "common"
if key not in test_map:
test_map[key] = []
test_map[key].append(test_file)
# sort the keys & values
keys = sorted(test_map.keys())
test_map = {k: " ".join(sorted(test_map[k])) for k in keys}
with open(json_output_file, "w", encoding="UTF-8") as fp:
json.dump(test_map, fp, ensure_ascii=False)
create_json_map(test_files_to_run, json_output_file)
def filter_tests(output_file, filters):
@ -667,11 +614,29 @@ def filter_tests(output_file, filters):
f.write(" ".join(test_files))
def parse_commit_message(commit_message):
"""
Parses the commit message to detect if a command is there to skip, force all or part of the CI.
Returns a dictionary of strings to bools with keys skip, test_all_models and test_all.
"""
if commit_message is None:
return {"skip": False, "no_filter": False, "test_all": False}
command_search = re.search(r"\[([^\]]*)\]", commit_message)
if command_search is not None:
command = command_search.groups()[0]
command = command.lower().replace("-", " ").replace("_", " ")
skip = command in ["ci skip", "skip ci", "circleci skip", "skip circleci"]
no_filter = set(command.split(" ")) == {"no", "filter"}
test_all = set(command.split(" ")) == {"test", "all"}
return {"skip": skip, "no_filter": no_filter, "test_all": test_all}
else:
return {"skip": False, "no_filter": False, "test_all": False}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--sanity_check", action="store_true", help="Only test that all tests and modules are accounted for."
)
parser.add_argument(
"--output_file", type=str, default="test_list.txt", help="Where to store the list of tests to run"
)
@ -704,33 +669,54 @@ if __name__ == "__main__":
help="Will only print the tree of modules depending on the file passed.",
default=None,
)
parser.add_argument(
"--commit_message",
type=str,
help="The commit message (which could contain a command to force all tests or skip the CI).",
default=None,
)
args = parser.parse_args()
if args.print_dependencies_of is not None:
print_tree_deps_of(args.print_dependencies_of)
elif args.sanity_check:
sanity_check()
elif args.filter_tests:
filter_tests(args.output_file, ["pipelines", "repo_utils"])
else:
repo = Repo(PATH_TO_TRANFORMERS)
repo = Repo(PATH_TO_REPO)
commit_message = repo.head.commit.message
commit_flags = parse_commit_message(commit_message)
if commit_flags["skip"]:
print("Force-skipping the CI")
quit()
if commit_flags["no_filter"]:
print("Running all tests fetched without filtering.")
if commit_flags["test_all"]:
print("Force-launching all tests")
diff_with_last_commit = args.diff_with_last_commit
if not diff_with_last_commit and not repo.head.is_detached and repo.head.ref == repo.refs.main:
print("main branch detected, fetching tests against last commit.")
diff_with_last_commit = True
if not commit_flags["test_all"]:
try:
infer_tests_to_run(
args.output_file,
diff_with_last_commit=diff_with_last_commit,
filters=args.filters,
json_output_file=args.json_output_file,
filter_models=not commit_flags["no_filter"],
)
filter_tests(args.output_file, ["repo_utils"])
except Exception as e:
print(f"\nError when trying to grab the relevant tests: {e}\n\nRunning all tests.")
commit_flags["test_all"] = True
if commit_flags["test_all"]:
with open(args.output_file, "w", encoding="utf-8") as f:
if args.filters is None:
f.write("./tests/")
else:
f.write(" ".join(args.filters))
test_files_to_run = get_all_tests()
create_json_map(test_files_to_run, args.json_output_file)