From ede5e041911afed37c8284a980342d4a2625b1d5 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Sat, 30 Apr 2022 10:40:46 +0200 Subject: [PATCH] Add a check on config classes docstring checkpoints (#17012) * Add the check * add missing ckpts * add a list to ignore * call the added check script * better regex pattern Co-authored-by: ydshieh --- .circleci/config.yml | 1 + Makefile | 1 + docs/source/en/pr_checks.mdx | 1 + .../models/convbert/configuration_convbert.py | 8 +- .../models/imagegpt/configuration_imagegpt.py | 2 +- utils/check_config_docstrings.py | 84 +++++++++++++++++++ 6 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 utils/check_config_docstrings.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 869406fc0b6..c74a26f8a9d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -881,6 +881,7 @@ jobs: - run: python utils/check_dummies.py - run: python utils/check_repo.py - run: python utils/check_inits.py + - run: python utils/check_config_docstrings.py - run: make deps_table_check_updated - run: python utils/tests_fetcher.py --sanity_check diff --git a/Makefile b/Makefile index 424d8798c9e..143be675b52 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,7 @@ repo-consistency: python utils/check_dummies.py python utils/check_repo.py python utils/check_inits.py + python utils/check_config_docstrings.py python utils/tests_fetcher.py --sanity_check # this target runs checks on all files diff --git a/docs/source/en/pr_checks.mdx b/docs/source/en/pr_checks.mdx index e44b9be59d9..57e0766c7f6 100644 --- a/docs/source/en/pr_checks.mdx +++ b/docs/source/en/pr_checks.mdx @@ -108,6 +108,7 @@ This checks that: - All objects added to the init are documented (performed by `utils/check_repo.py`) - All `__init__.py` files have the same content in their two sections (performed by `utils/check_inits.py`) - All code identified as a copy from another module is consistent with the original (performed by `utils/check_copies.py`) +- All configuration classes have at least one valid checkpoint mentioned in their docstrings (performed by `utils/check_config_docstrings.py`) - The translations of the READMEs and the index of the doc have the same model list as the main README (performed by `utils/check_copies.py`) - The auto-generated tables in the documentation are up to date (performed by `utils/check_table.py`) - The library has all objects available even if not all optional dependencies are installed (performed by `utils/check_dummies.py`) diff --git a/src/transformers/models/convbert/configuration_convbert.py b/src/transformers/models/convbert/configuration_convbert.py index c424326b2b9..5efa6018b60 100644 --- a/src/transformers/models/convbert/configuration_convbert.py +++ b/src/transformers/models/convbert/configuration_convbert.py @@ -37,9 +37,11 @@ class ConvBertConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ConvBertModel`]. It is used to instantiate an ConvBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ConvBERT - [conv-bert-base](https://huggingface.co/YituTech/conv-bert-base) architecture. Configuration objects inherit from - [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] - for more information. + [YituTech/conv-bert-base](https://huggingface.co/YituTech/conv-bert-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: vocab_size (`int`, *optional*, defaults to 30522): diff --git a/src/transformers/models/imagegpt/configuration_imagegpt.py b/src/transformers/models/imagegpt/configuration_imagegpt.py index d52414abfd3..e9cf1d910d9 100644 --- a/src/transformers/models/imagegpt/configuration_imagegpt.py +++ b/src/transformers/models/imagegpt/configuration_imagegpt.py @@ -32,7 +32,7 @@ class ImageGPTConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ImageGPTModel`] or a [`TFImageGPTModel`]. It is used to instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ImageGPT - [small](https://huggingface.co/imagegpt) architecture. + [openai/imagegpt-small](https://huggingface.co/openai/imagegpt-small) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. diff --git a/utils/check_config_docstrings.py b/utils/check_config_docstrings.py new file mode 100644 index 00000000000..382f42bfe15 --- /dev/null +++ b/utils/check_config_docstrings.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import inspect +import os +import re + + +# All paths are set with the intent you should run this script from the root of the repo with the command +# python utils/check_config_docstrings.py +PATH_TO_TRANSFORMERS = "src/transformers" + + +# This is to make sure the transformers module imported is the one in the repo. +spec = importlib.util.spec_from_file_location( + "transformers", + os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"), + submodule_search_locations=[PATH_TO_TRANSFORMERS], +) +transformers = spec.loader.load_module() + +CONFIG_MAPPING = transformers.models.auto.configuration_auto.CONFIG_MAPPING + +# Regex pattern used to find the checkpoint mentioned in the docstring of `config_class`. +# For example, `[bert-base-uncased](https://huggingface.co/bert-base-uncased)` +_re_checkpoint = re.compile("\[(.+?)\]\((https://huggingface\.co/.+?)\)") + + +CONFIG_CLASSES_TO_IGNORE_FOR_DOCSTRING_CHECKPOINT_CHECK = { + "CLIPConfig", + "DecisionTransformerConfig", + "EncoderDecoderConfig", + "RagConfig", + "SpeechEncoderDecoderConfig", + "VisionEncoderDecoderConfig", + "VisionTextDualEncoderConfig", +} + + +def check_config_docstrings_have_checkpoints(): + configs_without_checkpoint = [] + + for config_class in list(CONFIG_MAPPING.values()): + checkpoint_found = False + + # source code of `config_class` + config_source = inspect.getsource(config_class) + checkpoints = _re_checkpoint.findall(config_source) + + for checkpoint in checkpoints: + # Each `checkpoint` is a tuple of a checkpoint name and a checkpoint link. + # For example, `('bert-base-uncased', 'https://huggingface.co/bert-base-uncased')` + ckpt_name, ckpt_link = checkpoint + + # verify the checkpoint name corresponds to the checkpoint link + ckpt_link_from_name = f"https://huggingface.co/{ckpt_name}" + if ckpt_link == ckpt_link_from_name: + checkpoint_found = True + break + + name = config_class.__name__ + if not checkpoint_found and name not in CONFIG_CLASSES_TO_IGNORE_FOR_DOCSTRING_CHECKPOINT_CHECK: + configs_without_checkpoint.append(name) + + if len(configs_without_checkpoint) > 0: + message = "\n".join(sorted(configs_without_checkpoint)) + raise ValueError(f"The following configurations don't contain any valid checkpoint:\n{message}") + + +if __name__ == "__main__": + check_config_docstrings_have_checkpoints()