More utils doc (#25457)

* Document and clean more utils.

* More documentation and fixes

* Switch to Lysandre's token

* Address review comments

* Actually put else
Sylvain Gugger 2023-08-17 07:58:35 +02:00 committed by GitHub
parent 36f183ebab
commit 2defb6b048
9 changed files with 411 additions and 84 deletions

View File

@ -24,4 +24,4 @@ jobs:
- name: Update metadata
run: |
python utils/update_metadata.py --token ${{ secrets.SYLVAIN_HF_TOKEN }} --commit_sha ${{ github.sha }}
python utils/update_metadata.py --token ${{ secrets.LYSANDRE_HF_TOKEN }} --commit_sha ${{ github.sha }}

View File

@ -17,25 +17,26 @@ Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/m
To create the package for pypi.
1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the
documentation.
1. Create the release branch named: v<RELEASE>-release, for example v4.19-release. For a patch release checkout the
current release branch.
If releasing on a special branch, copy the updated README.md on the main branch for the commit you will make
for the post-release and run `make fix-copies` on the main branch as well.
2. Run Tests for Amazon Sagemaker. The documentation is located in `./tests/sagemaker/README.md`, otherwise @philschmid.
2. Run `make pre-release` (or `make pre-patch` for a patch release) and commit these changes with the message:
"Release: <VERSION>" and push.
3. Unpin specific versions from setup.py that use a git install.
3. Go back to the main branch and run `make post-release` then `make fix-copies`. Commit these changes with the
message "v<NEXT_VERSION>.dev.0" and push to main.
4. Checkout the release branch (v<RELEASE>-release, for example v4.19-release), and commit these changes with the
message: "Release: <VERSION>" and push.
# If you were just cutting the branch in preparation for a release, you can stop here for now.
5. Wait for the tests on main to be completed and be green (otherwise revert and fix bugs)
4. Wait for the tests on the release branch to be completed and be green (otherwise revert and fix bugs)
6. Add a tag in git to mark the release: "git tag v<VERSION> -m 'Adds tag v<VERSION> for pypi' "
5. On the release branch, add a tag in git to mark the release: "git tag v<VERSION> -m 'Adds tag v<VERSION> for pypi' "
Push the tag to git: git push --tags origin v<RELEASE>-release
7. Build both the sources and the wheel. Do not change anything in setup.py between
6. Build both the sources and the wheel. Do not change anything in setup.py between
creating the wheel and the source distribution (obviously).
Run `make build-release`. This will build the release and do some sanity checks for you. If this ends with an error
@ -43,7 +44,7 @@ To create the package for pypi.
You should now have a /dist directory with both .whl and .tar.gz source versions.
8. Check that everything looks correct by uploading the package to the pypi test server:
7. Check that everything looks correct by uploading the package to the pypi test server:
twine upload dist/* -r testpypi
(pypi suggests using twine, as other methods upload files via plaintext.)
@ -60,13 +61,10 @@ To create the package for pypi.
If making a patch release, double check the bug you are patching is indeed resolved.
9. Upload the final version to actual pypi:
8. Upload the final version to actual pypi:
twine upload dist/* -r pypi
10. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
11. Run `make post-release` then run `make fix-copies`. If you were on a branch for the release,
you need to go back to main before executing this.
9. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
"""
import os

View File

@ -12,11 +12,30 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utility that checks the big table in the file docs/source/en/index.md and potentially updates it.
Use from the root of the repo with:
```bash
python utils/check_table.py
```
for a check that will error in case of inconsistencies (used by `make repo-consistency`).
To auto-fix issues run:
```bash
python utils/check_table.py --fix_and_overwrite
```
which is used by `make fix-copies`.
"""
import argparse
import collections
import os
import re
from typing import List, Tuple
from transformers.utils import direct_transformers_import
@ -28,19 +47,28 @@ PATH_TO_DOCS = "docs/source/en"
REPO_PATH = "."
def _find_text_in_file(filename, start_prompt, end_prompt):
def _find_text_in_file(filename: str, start_prompt: str, end_prompt: str) -> Tuple[str, int, int, List[str]]:
"""
Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty
lines.
Find the text in filename between two prompts.
Args:
filename (`str`): The file to search into.
start_prompt (`str`): A string to look for at the start of the content searched.
end_prompt (`str`): A string that will mark the end of the content to look for.
Returns:
`Tuple[str, int, int, List[str]]`: The content between the prompts, the index of the line where it starts, the index of the line where it ends, and the full list of lines in the file.
"""
with open(filename, "r", encoding="utf-8", newline="\n") as f:
lines = f.readlines()
# Find the start prompt.
start_index = 0
while not lines[start_index].startswith(start_prompt):
start_index += 1
start_index += 1
# Now go until the end prompt.
end_index = start_index
while not lines[end_index].startswith(end_prompt):
end_index += 1
@ -54,12 +82,10 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
return "".join(lines[start_index:end_index]), start_index, end_index, lines
# Add here suffixes that are used to identify models, separated by |
ALLOWED_MODEL_SUFFIXES = "Model|Encoder|Decoder|ForConditionalGeneration"
# Regexes that match TF/Flax/PT model names.
# Regexes that match TF/Flax/PT model names. Add here suffixes that are used to identify models, separated by |
_re_tf_models = re.compile(r"TF(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")
_re_flax_models = re.compile(r"Flax(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")
# Will match any TF or Flax model too so need to be in an else branch afterthe two previous regexes.
# Will match any TF or Flax model too so need to be in an else branch after the two previous regexes.
_re_pt_models = re.compile(r"(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")
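Since the PT regex carries no framework prefix, it also matches every TF and Flax name, which is why it must only be tried last. A sketch of the intended dispatch (the `guess_framework` helper is illustrative, not part of this file):
```py
def guess_framework(model_name: str) -> str:
    # TF and Flax are tried first; "TFBertModel" would otherwise also
    # match the prefix-less PT regex.
    if _re_tf_models.match(model_name) is not None:
        return "tf"
    elif _re_flax_models.match(model_name) is not None:
        return "flax"
    elif _re_pt_models.match(model_name) is not None:
        return "pt"
    return "unknown"
```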
@ -67,22 +93,49 @@ _re_pt_models = re.compile(r"(.*)(?:Model|Encoder|Decoder|ForConditionalGenerati
transformers_module = direct_transformers_import(TRANSFORMERS_PATH)
# Thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
def camel_case_split(identifier):
"Split a camelcased `identifier` into words."
def camel_case_split(identifier: str) -> List[str]:
"""
Split a camel-cased name into words.
Args:
identifier (`str`): The camel-cased name to parse.
Returns:
`List[str]`: The list of words in the identifier (as separated by capital letters).
Example:
```py
>>> camel_case_split("CamelCasedClass")
["Camel", "Cased", "Class"]
```
"""
# Regex thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
matches = re.finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
return [m.group(0) for m in matches]
def _center_text(text, width):
def _center_text(text: str, width: int) -> str:
"""
Utility that will add spaces on the left and right of a text to make it centered for a given width.
Args:
text (`str`): The text to center.
width (`int`): The desired length of the result.
Returns:
`str`: A text of length `width` with the original `text` in the middle.
"""
text_length = 2 if text == "✅" or text == "❌" else len(text)
left_indent = (width - text_length) // 2
right_indent = width - text_length - left_indent
return " " * left_indent + text + " " * right_indent
def get_model_table_from_auto_modules():
"""Generates an up-to-date model table from the content of the auto modules."""
def get_model_table_from_auto_modules() -> str:
"""
Generates an up-to-date model table from the content of the auto modules.
"""
# Dictionary model names to config.
config_maping_names = transformers_module.models.auto.configuration_auto.CONFIG_MAPPING_NAMES
model_name_to_config = {
@ -92,7 +145,7 @@ def get_model_table_from_auto_modules():
}
model_name_to_prefix = {name: config.replace("Config", "") for name, config in model_name_to_config.items()}
# Dictionaries flagging if each model prefix has a slow/fast tokenizer, backend in PT/TF/Flax.
# Dictionaries flagging if each model prefix has a backend in PT/TF/Flax.
pt_models = collections.defaultdict(bool)
tf_models = collections.defaultdict(bool)
flax_models = collections.defaultdict(bool)
@ -145,7 +198,13 @@ def get_model_table_from_auto_modules():
def check_model_table(overwrite=False):
"""Check the model table in the index.rst is consistent with the state of the lib and maybe `overwrite`."""
"""
Check the model table in the index.md is consistent with the state of the lib and potentially fix it.
Args:
overwrite (`bool`, *optional*, defaults to `False`):
Whether or not to overwrite the table when it's not up to date.
"""
current_table, start_index, end_index, lines = _find_text_in_file(
filename=os.path.join(PATH_TO_DOCS, "index.md"),
start_prompt="<!--This table is updated automatically from the auto modules",

View File

@ -12,7 +12,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utility that checks the list of models in the tips in the task-specific pages of the doc is up to date and potentially
fixes it.
Use from the root of the repo with:
```bash
python utils/check_task_guides.py
```
for a check that will error in case of inconsistencies (used by `make repo-consistency`).
To auto-fix issues run:
```bash
python utils/check_task_guides.py --fix_and_overwrite
```
which is used by `make fix-copies`.
"""
import argparse
import os
@ -25,10 +44,17 @@ TRANSFORMERS_PATH = "src/transformers"
PATH_TO_TASK_GUIDES = "docs/source/en/tasks"
def _find_text_in_file(filename, start_prompt, end_prompt):
def _find_text_in_file(filename: str, start_prompt: str, end_prompt: str) -> Tuple[str, int, int, List[str]]:
"""
Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty
lines.
Find the text in filename between two prompts.
Args:
filename (`str`): The file to search into.
start_prompt (`str`): A string to look for at the start of the content searched.
end_prompt (`str`): A string that will mark the end of the content to look for.
Returns:
`Tuple[str, int, int, List[str]]`: The content between the prompts, the index of the line where it starts, the index of the line where it ends, and the full list of lines in the file.
"""
with open(filename, "r", encoding="utf-8", newline="\n") as f:
lines = f.readlines()
@ -38,6 +64,7 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
start_index += 1
start_index += 1
# Now go until the end prompt.
end_index = start_index
while not lines[end_index].startswith(end_prompt):
end_index += 1
@ -54,6 +81,7 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
# This is to make sure the transformers module imported is the one in the repo.
transformers_module = direct_transformers_import(TRANSFORMERS_PATH)
# Map between a task guide and the corresponding auto class.
TASK_GUIDE_TO_MODELS = {
"asr.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_CTC_MAPPING_NAMES,
"audio_classification.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
@ -81,9 +109,15 @@ SPECIAL_TASK_GUIDE_TO_MODEL_TYPES = {
}
def get_model_list_for_task(task_guide):
def get_model_list_for_task(task_guide: str) -> str:
"""
Return the list of models supporting given task.
Return the list of models supporting a given task.
Args:
task_guide (`str`): The name of the task guide to check.
Returns:
`str`: The list of models supporting this task, as links to their respective doc pages separated by commas.
"""
model_maping_names = TASK_GUIDE_TO_MODELS[task_guide]
special_model_types = SPECIAL_TASK_GUIDE_TO_MODEL_TYPES.get(task_guide, set())
@ -95,9 +129,17 @@ def get_model_list_for_task(task_guide):
return ", ".join([f"[{name}](../model_doc/{code})" for code, name in model_names.items()]) + "\n"
def check_model_list_for_task(task_guide, overwrite=False):
"""For a given task guide, checks the model list in the generated tip for consistency with the state of the lib and overwrites if needed."""
def check_model_list_for_task(task_guide: str, overwrite: bool = False):
"""
For a given task guide, checks the model list in the generated tip for consistency with the state of the lib and
updates it if needed.
Args:
task_guide (`str`):
The name of the task guide to check.
overwrite (`bool`, *optional*, defaults to `False`):
Whether or not to overwrite the table when it's not up to date.
"""
current_list, start_index, end_index, lines = _find_text_in_file(
filename=os.path.join(PATH_TO_TASK_GUIDES, task_guide),
start_prompt="<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->",

View File

@ -12,12 +12,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utility that sorts the imports in the custom inits of Transformers. Transformers uses init files that delay the
import of an object to when it's actually needed. This is to avoid the main init importing all models, which would
make the line `import transformers` very slow when the user has all optional dependencies installed. The inits with
delayed imports have two halves: one defining a dictionary `_import_structure` which maps modules to the name of the
objects in each module, and one in `TYPE_CHECKING` which looks like a normal init for type-checkers. `isort` or `ruff`
properly sorts the second half, which looks like traditional imports; the goal of this script is to sort the first half.
Use from the root of the repo with:
```bash
python utils/custom_init_isort.py
```
which will auto-sort the imports (used in `make style`).
For a check only (as used in `make quality`) run:
```bash
python utils/custom_init_isort.py --check_only
```
"""
import argparse
import os
import re
from typing import Any, Callable, List, Optional
# Path is defined with the intent you should run this script from the root of the repo.
PATH_TO_TRANSFORMERS = "src/transformers"
# Pattern that looks at the indentation in a line.
@ -32,17 +55,30 @@ _re_strip_line = re.compile(r'^\s*"([^"]+)",\s*$')
_re_bracket_content = re.compile(r"\[([^\]]+)\]")
def get_indent(line):
"""Returns the indent in `line`."""
def get_indent(line: str) -> str:
"""Returns the indent in given line (as string)."""
search = _re_indent.search(line)
return "" if search is None else search.groups()[0]
def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_prompt=None):
def split_code_in_indented_blocks(
code: str, indent_level: str = "", start_prompt: Optional[str] = None, end_prompt: Optional[str] = None
) -> List[str]:
"""
Split `code` into its indented blocks, starting at `indent_level`. If provided, begins splitting after
`start_prompt` and stops at `end_prompt` (but returns what's before `start_prompt` as a first block and what's
after `end_prompt` as a last block, so `code` is always the same as joining the result of this function).
Split some code into its indented blocks, starting at a given level.
Args:
code (`str`): The code to split.
indent_level (`str`): The indent level (as string) to use for identifying the blocks to split.
start_prompt (`str`, *optional*): If provided, only starts splitting at the line where this text is.
end_prompt (`str`, *optional*): If provided, stops splitting at a line where this text is.
Warning:
The text before `start_prompt` or after `end_prompt` (if provided) is not ignored, just not split. The input `code`
can thus be retrieved by joining the result.
Returns:
`List[str]`: The list of blocks.
"""
# Let's split the code into lines and move to start_index.
index = 0
@ -54,12 +90,17 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
else:
blocks = []
# We split into blocks until we get to the `end_prompt` (or the end of the block).
# This variable contains the block treated at a given time.
current_block = [lines[index]]
index += 1
# We split into blocks until we get to the `end_prompt` (or the end of the file).
while index < len(lines) and (end_prompt is None or not lines[index].startswith(end_prompt)):
# We have a non-empty line with the proper indent -> start of a new block
if len(lines[index]) > 0 and get_indent(lines[index]) == indent_level:
# Store the current block in the result and reset it. There are two cases: the line is part of the block (like
# a closing parenthesis) or not.
if len(current_block) > 0 and get_indent(current_block[-1]).startswith(indent_level + " "):
# Line is part of the current block
current_block.append(lines[index])
blocks.append("\n".join(current_block))
if index < len(lines) - 1:
@ -68,9 +109,11 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
else:
current_block = []
else:
# Line is not part of the current block
blocks.append("\n".join(current_block))
current_block = [lines[index]]
else:
# Just add the line to the current block
current_block.append(lines[index])
index += 1
@ -85,8 +128,10 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
return blocks
def ignore_underscore(key):
"Wraps a `key` (that maps an object to string) to lower case and remove underscores."
def ignore_underscore_and_lowercase(key: Callable[[Any], str]) -> Callable[[Any], str]:
"""
Wraps a key function (as used in a sort) to lowercase and ignore underscores.
"""
def _inner(x):
return key(x).lower().replace("_", "")
@ -94,8 +139,21 @@ def ignore_underscore(key):
return _inner
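A quick doctest-style illustration of the wrapped key:
```py
>>> key = ignore_underscore_and_lowercase(lambda x: x)
>>> sorted(["_LazyModule", "Zebra", "apple"], key=key)
['apple', '_LazyModule', 'Zebra']
```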
def sort_objects(objects, key=None):
"Sort a list of `objects` following the rules of isort. `key` optionally maps an object to a str."
def sort_objects(objects: List[Any], key: Optional[Callable[[Any], str]] = None) -> List[Any]:
"""
Sort a list of objects following the rules of isort (all uppercased first, camel-cased second and lower-cased
last).
Args:
objects (`List[Any]`):
The list of objects to sort.
key (`Callable[[Any], str]`, *optional*):
A function taking an object as input and returning a string, used to sort them by alphabetical order.
If not provided, will default to noop (so a `key` must be provided if the `objects` are not of type string).
Returns:
`List[Any]`: The sorted list with the same elements as in the input.
"""
# If no key is provided, we use a noop.
def noop(x):
@ -110,18 +168,26 @@ def sort_objects(objects, key=None):
# Functions begin with a lowercase, they go last.
functions = [obj for obj in objects if not key(obj)[0].isupper()]
key1 = ignore_underscore(key)
# Then we sort each group.
key1 = ignore_underscore_and_lowercase(key)
return sorted(constants, key=key1) + sorted(classes, key=key1) + sorted(functions, key=key1)
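A doctest-style sketch of the resulting order (hypothetical object names):
```py
>>> sort_objects(["load_tf_weights", "AlbertModel", "ALBERT_CONSTANT", "albert_helper", "AlbertConfig"])
['ALBERT_CONSTANT', 'AlbertConfig', 'AlbertModel', 'albert_helper', 'load_tf_weights']
```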
def sort_objects_in_import(import_statement):
def sort_objects_in_import(import_statement: str) -> str:
"""
Return the same `import_statement` but with objects properly sorted.
Sorts the imports in a single import statement.
Args:
import_statement (`str`): The import statement in which to sort the imports.
Returns:
`str`: The same as the input, but with objects properly sorted.
"""
# This inner function sorts imports between [ ].
def _replace(match):
imports = match.groups()[0]
# If there is one import only, nothing to do.
if "," not in imports:
return f"[{imports}]"
keys = [part.strip().replace('"', "") for part in imports.split(",")]
@ -165,13 +231,18 @@ def sort_objects_in_import(import_statement):
return import_statement
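Assuming the single-line branch of the implementation, the effect looks like this (hypothetical `_import_structure` entry):
```py
>>> sort_objects_in_import('    "models.albert": ["AlbertModel", "ALBERT_CONSTANT", "load_tf_weights"],')
'    "models.albert": ["ALBERT_CONSTANT", "AlbertModel", "load_tf_weights"],'
```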
def sort_imports(file, check_only=True):
def sort_imports(file: str, check_only: bool = True):
"""
Sort `_import_structure` imports in `file`, `check_only` determines if we only check or overwrite.
Sort the imports defined in the `_import_structure` of a given init.
Args:
file (`str`): The path to the init to check/fix.
check_only (`bool`, *optional*, defaults to `True`): Whether or not to just check (and not auto-fix) the init.
"""
with open(file, encoding="utf-8") as f:
code = f.read()
# If the file is not a custom init, there is nothing to do.
if "_import_structure" not in code:
return
@ -234,6 +305,12 @@ def sort_imports(file, check_only=True):
def sort_imports_in_all_inits(check_only=True):
"""
Sort the imports defined in the `_import_structure` of all inits in the repo.
Args:
check_only (`bool`, *optional*, defaults to `True`): Whether or not to just check (and not auto-fix) the init.
"""
failures = []
for root, _, files in os.walk(PATH_TO_TRANSFORMERS):
if "__init__.py" in files:

View File

@ -12,7 +12,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utility that prepares the repository for releases (or patches) by updating all versions in the relevant places. It
also performs some post-release cleanup, by updating the links in the main README to respective model doc pages (from
main to stable).
To prepare for a release, use from the root of the repo on the release branch with:
```bash
python release.py
```
or use `make pre-release`.
To prepare for a patch release, use from the root of the repo on the release branch with:
```bash
python release.py --patch
```
or use `make pre-patch`.
To do the post-release cleanup, use from the root of the repo on the main branch with:
```bash
python release.py --post_release
```
or use `make post-release`.
"""
import argparse
import os
import re
@ -20,13 +48,16 @@ import re
import packaging.version
# All paths are defined with the intent that this script should be run from the root of the repo.
PATH_TO_EXAMPLES = "examples/"
# This maps a type of file to the pattern to look for when searching where the version is defined, as well as the
# template to follow when replacing it with the new version.
REPLACE_PATTERNS = {
"examples": (re.compile(r'^check_min_version\("[^"]+"\)\s*$', re.MULTILINE), 'check_min_version("VERSION")\n'),
"init": (re.compile(r'^__version__\s+=\s+"([^"]+)"\s*$', re.MULTILINE), '__version__ = "VERSION"\n'),
"setup": (re.compile(r'^(\s*)version\s*=\s*"[^"]+",', re.MULTILINE), r'\1version="VERSION",'),
"doc": (re.compile(r'^(\s*)release\s*=\s*"[^"]+"$', re.MULTILINE), 'release = "VERSION"\n'),
}
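Each entry pairs a compiled regex with a replacement template in which `VERSION` is substituted; a small sketch of how one pattern is applied:
```py
# Sketch: applying the "init" pattern to a one-line snippet.
re_pattern, template = REPLACE_PATTERNS["init"]
code = '__version__ = "4.31.0.dev0"\n'
print(re_pattern.sub(template.replace("VERSION", "4.31.0"), code))
# -> __version__ = "4.31.0"
```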
# This maps a type of file to its path in Transformers
REPLACE_FILES = {
"init": "src/transformers/__init__.py",
"setup": "setup.py",
@ -34,19 +65,31 @@ REPLACE_FILES = {
README_FILE = "README.md"
def update_version_in_file(fname, version, pattern):
"""Update the version in one file using a specific pattern."""
def update_version_in_file(fname: str, version: str, file_type: str):
"""
Update the version of Transformers in one file.
Args:
fname (`str`): The path to the file where we want to update the version.
version (`str`): The new version to set in the file.
file_type (`str`): The type of the file (should be a key in `REPLACE_PATTERNS`).
"""
with open(fname, "r", encoding="utf-8", newline="\n") as f:
code = f.read()
re_pattern, replace = REPLACE_PATTERNS[pattern]
re_pattern, replace = REPLACE_PATTERNS[file_type]
replace = replace.replace("VERSION", version)
code = re_pattern.sub(replace, code)
with open(fname, "w", encoding="utf-8", newline="\n") as f:
f.write(code)
def update_version_in_examples(version):
"""Update the version in all examples files."""
def update_version_in_examples(version: str):
"""
Update the version in all examples files.
Args:
version (`str`): The new version to set in the examples.
"""
for folder, directories, fnames in os.walk(PATH_TO_EXAMPLES):
# Removing some of the folders with non-actively maintained examples from the walk
if "research_projects" in directories:
@ -55,19 +98,28 @@ def update_version_in_examples(version):
directories.remove("legacy")
for fname in fnames:
if fname.endswith(".py"):
update_version_in_file(os.path.join(folder, fname), version, pattern="examples")
update_version_in_file(os.path.join(folder, fname), version, file_type="examples")
def global_version_update(version, patch=False):
"""Update the version in all needed files."""
def global_version_update(version: str, patch: bool = False):
"""
Update the version in all needed files.
Args:
version (`str`): The new version to set everywhere.
patch (`bool`, *optional*, defaults to `False`): Whether or not this is a patch release.
"""
for pattern, fname in REPLACE_FILES.items():
update_version_in_file(fname, version, pattern)
if not patch:
# We don't update the version in the examples for patch releases.
update_version_in_examples(version)
def clean_main_ref_in_model_list():
"""Replace the links from main doc tp stable doc in the model list of the README."""
"""
Replace the links from main doc to stable doc in the model list of the README.
"""
# If the introduction or the conclusion of the list changes, the prompts may need to be updated.
_start_prompt = "🤗 Transformers currently provides the following architectures"
_end_prompt = "1. Want to contribute a new model?"
@ -94,16 +146,26 @@ def clean_main_ref_in_model_list():
f.writelines(lines)
def get_version():
"""Reads the current version in the __init__."""
def get_version() -> packaging.version.Version:
"""
Reads the current version in the main __init__.
"""
with open(REPLACE_FILES["init"], "r") as f:
code = f.read()
default_version = REPLACE_PATTERNS["init"][0].search(code).groups()[0]
return packaging.version.parse(default_version)
def pre_release_work(patch=False):
"""Do all the necessary pre-release steps."""
def pre_release_work(patch: bool = False):
"""
Do all the necessary pre-release steps:
- figure out the next minor release version and ask for confirmation
- update the version everywhere
- clean up the model list in the main README
Args:
patch (`bool`, *optional*, defaults to `False`): Whether or not this is a patch release.
"""
# First let's get the default version: base version if we are in dev, bump minor otherwise.
default_version = get_version()
if patch and default_version.is_devrelease:
@ -115,7 +177,7 @@ def pre_release_work(patch=False):
else:
default_version = f"{default_version.major}.{default_version.minor + 1}.0"
# Now let's ask nicely if that's the right one.
# Now let's ask nicely if we have found the right version.
version = input(f"Which version are you releasing? [{default_version}]")
if len(version) == 0:
version = default_version
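The version arithmetic relies on `packaging.version`; a sketch of how the three defaults are derived:
```py
import packaging.version

v = packaging.version.parse("4.32.0.dev0")
assert v.is_devrelease
v.base_version                        # "4.32.0": default when releasing from a dev version
f"{v.major}.{v.minor}.{v.micro + 1}"  # "4.32.1": default for a patch of a released version
f"{v.major}.{v.minor + 1}.0"          # "4.33.0": the minor bump used otherwise
```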
@ -128,7 +190,12 @@ def pre_release_work(patch=False):
def post_release_work():
"""Do all the necesarry post-release steps."""
"""
Do all the necessary post-release steps:
- figure out the next dev version and ask for confirmation
- update the version everywhere
- clean up the model list in the main README
"""
# First let's get the current version
current_version = get_version()
dev_version = f"{current_version.major}.{current_version.minor + 1}.0.dev0"

View File

@ -12,12 +12,30 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utility that sorts the names in the auto mappings defined in the auto modules in alphabetical order.
Use from the root of the repo with:
```bash
python utils/sort_auto_mappings.py
```
to auto-fix all the auto mappings (used in `make style`).
To only check if the mappings are properly sorted (as used in `make quality`), do:
```bash
python utils/sort_auto_mappings.py --check_only
```
"""
import argparse
import os
import re
from typing import Optional
# Paths are set with the intent you should run this script from the root of the repo.
PATH_TO_AUTO_MODULE = "src/transformers/models/auto"
@ -28,7 +46,18 @@ _re_intro_mapping = re.compile(r"[A-Z_]+_MAPPING(\s+|_[A-Z_]+\s+)=\s+OrderedDict
_re_identifier = re.compile(r'\s*\(\s*"(\S[^"]+)"')
def sort_auto_mapping(fname, overwrite: bool = False):
def sort_auto_mapping(fname: str, overwrite: bool = False) -> Optional[bool]:
"""
Sort all auto mappings in a file.
Args:
fname (`str`): The name of the file where we want to sort auto-mappings.
overwrite (`bool`, *optional*, defaults to `False`): Whether or not to fix and overwrite the file.
Returns:
`Optional[bool]`: Returns `None` if `overwrite=True`. Otherwise returns `True` if the file has an auto-mapping
improperly sorted, `False` if the file is okay.
"""
with open(fname, "r", encoding="utf-8") as f:
content = f.read()
@ -37,8 +66,8 @@ def sort_auto_mapping(fname, overwrite: bool = False):
line_idx = 0
while line_idx < len(lines):
if _re_intro_mapping.search(lines[line_idx]) is not None:
indent = len(re.search(r"^(\s*)\S", lines[line_idx]).groups()[0]) + 8
# Start of a new mapping!
indent = len(re.search(r"^(\s*)\S", lines[line_idx]).groups()[0]) + 8
while not lines[line_idx].startswith(" " * indent + "("):
new_lines.append(lines[line_idx])
line_idx += 1
@ -65,11 +94,17 @@ def sort_auto_mapping(fname, overwrite: bool = False):
if overwrite:
with open(fname, "w", encoding="utf-8") as f:
f.write("\n".join(new_lines))
elif "\n".join(new_lines) != content:
return True
else:
return "\n".join(new_lines) != content
def sort_all_auto_mappings(overwrite: bool = False):
"""
Sort all auto mappings in the library.
Args:
overwrite (`bool`, *optional*, defaults to `False`): Whether or not to fix and overwrite the file.
"""
fnames = [os.path.join(PATH_TO_AUTO_MODULE, f) for f in os.listdir(PATH_TO_AUTO_MODULE) if f.endswith(".py")]
diffs = [sort_auto_mapping(fname, overwrite=overwrite) for fname in fnames]

View File

@ -15,6 +15,7 @@
"""
Welcome to tests_fetcher V2.
This util is designed to fetch tests to run on a PR so that only the tests impacted by the modifications are run, and
when too many models are being impacted, only run the tests of a subset of core models. It works like this.

View File

@ -12,12 +12,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utility that updates the metadata of the Transformers library in the repository `huggingface/transformers-metadata`.
Usage for an update (as used by the GitHub action `update_metadata`):
```bash
python utils/update_metadata.py --token <token> --commit_sha <commit_sha>
```
Usage to check all pipelines are properly defined in the constant `PIPELINE_TAGS_AND_AUTO_MODELS` of this script, so
that new pipelines are properly added as metadata (as used in `make repo-consistency`):
```bash
python utils/update_metadata.py --check-only
```
"""
import argparse
import collections
import os
import re
import tempfile
from typing import Dict, List, Tuple
import pandas as pd
from datasets import Dataset
@ -102,14 +118,29 @@ PIPELINE_TAGS_AND_AUTO_MODELS = [
]
# Thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
def camel_case_split(identifier):
"Split a camelcased `identifier` into words."
def camel_case_split(identifier: str) -> List[str]:
"""
Split a camel-cased name into words.
Args:
identifier (`str`): The camel-cased name to parse.
Returns:
`List[str]`: The list of words in the identifier (as separated by capital letters).
Example:
```py
>>> camel_case_split("CamelCasedClass")
["Camel", "Cased", "Class"]
```
"""
# Regex thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
matches = re.finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
return [m.group(0) for m in matches]
def get_frameworks_table():
def get_frameworks_table() -> pd.DataFrame:
"""
Generates a dataframe containing the supported auto classes for each model type, using the content of the auto
modules.
@ -155,7 +186,8 @@ def get_frameworks_table():
data["tensorflow"] = [tf_models[t] for t in all_models]
data["flax"] = [flax_models[t] for t in all_models]
# Now let's use the auto-mapping names to make sure
# Now let's find the right processing class for each model. In order we check if there is a Processor, then a
# Tokenizer, then a FeatureExtractor, then an ImageProcessor
processors = {}
for t in all_models:
if t in transformers_module.models.auto.processing_auto.PROCESSOR_MAPPING_NAMES:
@ -164,6 +196,8 @@ def get_frameworks_table():
processors[t] = "AutoTokenizer"
elif t in transformers_module.models.auto.feature_extraction_auto.FEATURE_EXTRACTOR_MAPPING_NAMES:
processors[t] = "AutoFeatureExtractor"
elif t in transformers_module.models.auto.image_processing_auto.IMAGE_PROCESSOR_MAPPING_NAMES:
processors[t] = "AutoFeatureExtractor"
else:
# Default to AutoTokenizer if a model has nothing, for backward compatibility.
processors[t] = "AutoTokenizer"
@ -173,10 +207,17 @@ def get_frameworks_table():
return pd.DataFrame(data)
def update_pipeline_and_auto_class_table(table):
def update_pipeline_and_auto_class_table(table: Dict[str, Tuple[str, str]]) -> Dict[str, Tuple[str, str]]:
"""
Update the table of model class to (pipeline_tag, auto_class) without removing old keys if they don't exist
anymore.
Update the table mapping models to pipelines and auto classes without removing old keys if they don't exist anymore.
Args:
table (`Dict[str, Tuple[str, str]]`):
The existing table mapping model names to a tuple containing the pipeline tag and the auto-class name with
which they should be used.
Returns:
`Dict[str, Tuple[str, str]]`: The updated table in the same format.
"""
auto_modules = [
transformers_module.models.auto.modeling_auto,
@ -205,9 +246,13 @@ def update_pipeline_and_auto_class_table(table):
return table
def update_metadata(token, commit_sha):
def update_metadata(token: str, commit_sha: str):
"""
Update the metadata for the Transformers repo.
Update the metadata for the Transformers repo in `huggingface/transformers-metadata`.
Args:
token (`str`): A valid token giving write access to `huggingface/transformers-metadata`.
commit_sha (`str`): The commit SHA on Transformers corresponding to this update.
"""
frameworks_table = get_frameworks_table()
frameworks_dataset = Dataset.from_pandas(frameworks_table)
@ -255,6 +300,9 @@ def update_metadata(token, commit_sha):
def check_pipeline_tags():
"""
Check all pipeline tags are properly defined in the `PIPELINE_TAGS_AND_AUTO_MODELS` constant of this script.
"""
in_table = {tag: cls for tag, _, cls in PIPELINE_TAGS_AND_AUTO_MODELS}
pipeline_tasks = transformers_module.pipelines.SUPPORTED_TASKS
missing = []