Fix doctest files fetch issue (#23277)

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Author: Yih-Dar, committed by GitHub on 2023-05-11 17:14:06 +02:00
Parent: 5d02e6bd20
Commit: 6a6225beab
3 changed files with 78 additions and 16 deletions

File: .circleci/create_circleci_config.py

@@ -432,10 +432,13 @@ repo_utils_job = CircleCIJob(
     tests_to_run="tests/repo_utils",
 )
 
-# At this moment, only the files that are in `utils/documentation_tests.txt` will be kept (together with a dummy file).
-py_command = 'import os; import json; fp = open("pr_documentation_tests.txt"); data_1 = fp.read().strip().split("\\n"); fp = open("utils/documentation_tests.txt"); data_2 = fp.read().strip().split("\\n"); to_test = [x for x in data_1 if x in set(data_2)] + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)'
+# We also include a `dummy.py` file in the files to be doc-tested to prevent edge case failure. Otherwise, the pytest
+# hangs forever during test collection while showing `collecting 0 items / 21 errors`. (To see this, we have to remove
+# the bash output redirection.)
+py_command = 'from utils.tests_fetcher import get_doctest_files; to_test = get_doctest_files() + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)'
 py_command = f"$(python3 -c '{py_command}')"
-command = f'echo "{py_command}" > pr_documentation_tests_filtered.txt'
+command = f'echo "{py_command}" > pr_documentation_tests_temp.txt'
 doc_test_job = CircleCIJob(
     "pr_documentation_tests",
     additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"},

@@ -451,27 +454,20 @@ doc_test_job = CircleCIJob(
         "touch dummy.py",
         {
             "name": "Get files to test",
-            "command":
-                "git remote add upstream https://github.com/huggingface/transformers.git && git fetch upstream \n"
-                "git diff --name-only --relative --diff-filter=AMR refs/remotes/upstream/main...HEAD | grep -E '\.(py|mdx)$' | grep -Ev '^\..*|/\.' | grep -Ev '__' > pr_documentation_tests.txt"
+            "command": command,
         },
         {
-            "name": "List files beings changed: pr_documentation_tests.txt",
+            "name": "Show information in `Get files to test`",
             "command":
-                "cat pr_documentation_tests.txt"
+                "cat pr_documentation_tests_temp.txt"
         },
         {
-            "name": "Filter pr_documentation_tests.txt",
+            "name": "Get the last line in `pr_documentation_tests.txt`",
             "command":
-                command
-        },
-        {
-            "name": "List files beings tested: pr_documentation_tests_filtered.txt",
-            "command":
-                "cat pr_documentation_tests_filtered.txt"
+                "tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests.txt"
         },
     ],
-    tests_to_run="$(cat pr_documentation_tests_filtered.txt)",  # noqa
+    tests_to_run="$(cat pr_documentation_tests.txt)",  # noqa
     pytest_options={"-doctest-modules": None, "doctest-glob": "*.mdx", "dist": "loadfile", "rvsA": None},
     command_timeout=1200,  # test cannot run longer than 1200 seconds
     pytest_num_workers=1,
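Once the f-strings above are rendered, the doctest job runs three steps in order: write the output of the python3 -c one-liner to pr_documentation_tests_temp.txt, print that file for debugging, and keep only its last line as pr_documentation_tests.txt. The tail -n1 step exists because get_doctest_files() prints progress information (for example the `### DIFF ###` banner added in utils/tests_fetcher.py below) before printing the space-separated file list; all of that output is captured by the command substitution and lands in the temp file, so only the last line is kept as the actual list of files to doctest. A minimal sketch, not part of the commit and with illustrative variable names, that renders the resulting shell commands:

# Sketch only (not from the commit): render the three step commands the doctest job
# ends up running. The get_files/show_files/keep_last_line names are illustrative;
# the command strings themselves mirror the config above.
py_command = (
    'from utils.tests_fetcher import get_doctest_files; '
    'to_test = get_doctest_files() + ["dummy.py"]; '
    'to_test = " ".join(to_test); print(to_test)'
)
get_files = f"echo \"$(python3 -c '{py_command}')\" > pr_documentation_tests_temp.txt"
show_files = "cat pr_documentation_tests_temp.txt"
keep_last_line = "tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests.txt"

for step in (get_files, show_files, keep_last_line):
    print(step)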

File: .github/workflows/doctests.yml (GPU doctest workflow)

@@ -25,11 +25,17 @@ jobs:
       image: huggingface/transformers-all-latest-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
+      - name: uninstall transformers (installed during docker image build)
+        run: python3 -m pip uninstall -y transformers
       - uses: actions/checkout@v3
       - name: NVIDIA-SMI
         run: |
           nvidia-smi
+      - name: Install transformers in edit mode
+        run: python3 -m pip install -e .
       - name: GPU visibility
         run: |
           python3 utils/print_env.py
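The two added steps uninstall the transformers version baked into the Docker image and reinstall the library in editable mode from the checkout, presumably so that the doctests (and helpers such as utils/tests_fetcher.py) run against the checked-out source tree rather than the pre-installed copy. A minimal sanity-check sketch, an assumption rather than a step from the workflow, that one could run inside the container after these steps:

# Sketch only (assumption, not part of the workflow): verify that `transformers`
# now resolves to the checked-out working tree rather than the copy that was
# pre-installed in the huggingface/transformers-all-latest-gpu image.
import transformers

print(transformers.__version__)
print(transformers.__file__)  # expected to point into the workflow's checkout directory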

File: utils/tests_fetcher.py

@@ -216,6 +216,66 @@ def get_modified_python_files(diff_with_last_commit=False):
         return get_diff(repo, repo.head.commit, parent_commits)
 
 
+def get_diff_for_py_and_mdx_files(repo, base_commit, commits):
+    """
+    Get's the diff between one or several commits and the head of the repository.
+    """
+    print("\n### DIFF ###\n")
+    code_diff = []
+    for commit in commits:
+        for diff_obj in commit.diff(base_commit):
+            # We always add new python files
+            if diff_obj.change_type in ["A", "M", "R"] and (
+                diff_obj.b_path.endswith(".py") or diff_obj.b_path.endswith(".mdx")
+            ):
+                code_diff.append(diff_obj.b_path)
+    return code_diff
+
+
+def get_modified_python_and_mdx_files(diff_with_last_commit=False):
+    """
+    Return a list of python and mdx files that have been modified between:
+
+    - the current head and the main branch if `diff_with_last_commit=False` (default)
+    - the current head and its parent commit otherwise.
+    """
+    repo = Repo(PATH_TO_REPO)
+
+    if not diff_with_last_commit:
+        print(f"main is at {repo.refs.main.commit}")
+        print(f"Current head is at {repo.head.commit}")
+
+        branching_commits = repo.merge_base(repo.refs.main, repo.head)
+        for commit in branching_commits:
+            print(f"Branching commit: {commit}")
+        return get_diff_for_py_and_mdx_files(repo, repo.head.commit, branching_commits)
+    else:
+        print(f"main is at {repo.head.commit}")
+
+        parent_commits = repo.head.commit.parents
+        for commit in parent_commits:
+            print(f"Parent commit: {commit}")
+        return get_diff_for_py_and_mdx_files(repo, repo.head.commit, parent_commits)
+
+
+def get_doctest_files(diff_with_last_commit=False):
+    """
+    Return a list of python and mdx files that have been modified between:
+
+    - the current head and the main branch if `diff_with_last_commit=False` (default)
+    - the current head and its parent commit otherwise.
+    """
+    test_files_to_run = get_modified_python_and_mdx_files(diff_with_last_commit)
+
+    with open("utils/documentation_tests.txt") as fp:
+        documentation_tests = set(fp.read().strip().split("\n"))
+
+    # So far we don't have 100% coverage for doctest. This line will be removed once we achieve 100%.
+    test_files_to_run = [x for x in test_files_to_run if x in documentation_tests]
+    # Make sure we did not end up with a test file that was removed
+    test_files_to_run = [f for f in test_files_to_run if (PATH_TO_REPO / f).exists()]
+
+    return test_files_to_run
+
+
 # (:?^|\n) -> Non-catching group for the beginning of the doc or a new line.
 # \s*from\s+(\.+\S+)\s+import\s+([^\n]+) -> Line only contains from .xxx import yyy and we catch .xxx and yyy
 # (?=\n) -> Look-ahead to a new line. We can't just put \n here or using find_all on this re will only catch every
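A usage sketch (not part of the commit) of the new helper, mirroring the one-liner that the CircleCI config above builds with python3 -c; it assumes it is run from the root of a transformers checkout, on a branch that diverges from main, so that the git repository and utils/documentation_tests.txt resolve:

# Sketch only: reproduce locally what the "Get files to test" CircleCI step computes.
from utils.tests_fetcher import get_doctest_files

# Modified .py/.mdx files (relative to main) that are also listed in
# utils/documentation_tests.txt, plus the dummy.py placeholder the job creates
# with `touch dummy.py` to avoid the empty-collection hang mentioned above.
to_test = get_doctest_files() + ["dummy.py"]
print(" ".join(to_test))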