mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-03 21:00:08 +06:00
check the validity of links
We add a script and a CI workflow to check that all download links present in the source code are valid.
This commit is contained in:
parent
35ff345fc9
commit
f230d91b43
@ -82,6 +82,16 @@ jobs:
|
||||
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
||||
- run: sudo pip install --progress-bar off -r requirements.txt
|
||||
- run: ./.circleci/deploy.sh
|
||||
# CI job: checks out the repository, installs `requests` and runs utils/link_tester.py.
repository_consistency:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
resource_class: small
|
||||
parallelism: 1
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install requests
|
||||
- run: python ./utils/link_tester.py
|
||||
workflow_filters: &workflow_filters
|
||||
filters:
|
||||
branches:
|
||||
@ -91,6 +101,7 @@ workflows:
|
||||
version: 2
|
||||
build_and_test:
|
||||
jobs:
|
||||
- repository_consistency
|
||||
- build_py3_torch_and_tf
|
||||
- build_py3_torch
|
||||
- build_py3_tf
|
||||
|
79
utils/link_tester.py
Normal file
79
utils/link_tester.py
Normal file
@ -0,0 +1,79 @@
|
||||
""" Link tester.
|
||||
|
||||
This little utility reads all the python files in the repository,
|
||||
scans for links pointing to S3 and tests the links one by one. Raises an error
|
||||
at the end of the scan if at least one link was reported broken.
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
# Matches a quoted string literal that starts with "https://s3".
# Groups: (1) the opening quote, (2) the "https://s3" prefix, (3) the rest of the link.
# The rest is matched lazily so that a link stops at the first closing quote; a greedy
# match would swallow everything up to the last matching quote on the same line and
# merge several links into one.
REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*?)\1"""
|
||||
|
||||
|
||||
def list_python_files_in_repository():
    """ List all python files in the repository.

    This function assumes that the script is executed in the root folder:
    the walk starts at "." and every directory whose path contains
    "templates" is skipped.

    Returns:
        A list with the path (relative to the current directory) of every
        file whose name ends with ".py".
    """
    source_code_files = []
    for path, _subdirs, files in os.walk("."):
        if "templates" in path:
            continue
        for name in files:
            # Test the extension rather than a substring: `".py" in name` would also
            # select files such as "module.pyx", "stubs.pyi" or "setup.py.bak".
            if name.endswith(".py"):
                source_code_files.append(os.path.join(path, name))

    return source_code_files
|
||||
|
||||
|
||||
def find_all_links(file_paths):
    """ Collect the links found in every file of `file_paths`.

    Each path is handed to `scan_code_for_links` and the results are
    concatenated in the order the paths were given.
    """
    collected = []
    for file_path in file_paths:
        collected.extend(scan_code_for_links(file_path))
    return collected
|
||||
|
||||
|
||||
def scan_code_for_links(source):
    """ Scans the file to find links using a regular expression.

    Args:
        source: path of the file to scan.

    Returns a list of links, each one rebuilt from the prefix and suffix
    groups captured by `REGEXP_FIND_S3_LINKS`.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which may be ASCII in a minimal CI container.
    with open(source, "r", encoding="utf-8") as source_file:
        content = source_file.read()

    raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
    # The first captured group is the opening quote; it is not part of the link.
    return [prefix + suffix for _, prefix, suffix in raw_links]
|
||||
|
||||
|
||||
def check_all_links(links, timeout=10):
    """ Check that the provided links are valid.

    Links are considered valid if a HEAD request to the server
    returns a 200 status code. A link whose request raises a
    `requests.RequestException` (connection error, timeout, ...) is
    reported as broken instead of aborting the whole scan.

    Args:
        links: iterable of URLs to test.
        timeout: number of seconds to wait for the server before giving up
            on a link; without a timeout `requests` may wait indefinitely.

    Returns:
        The list of broken links, in the order they were provided.
    """
    broken_links = []
    for link in links:
        try:
            head = requests.head(link, timeout=timeout)
        except requests.RequestException:
            # Unreachable host, timeout, invalid URL...: the link is not usable.
            broken_links.append(link)
            continue
        if head.status_code != 200:
            broken_links.append(link)

    return broken_links
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Announce the scan before it starts: testing the links is the slow part,
    # so printing this afterwards would leave the job silent while it works.
    print("Looking for broken links to pre-trained models/configs/tokenizers...")
    file_paths = list_python_files_in_repository()
    links = find_all_links(file_paths)
    broken_links = check_all_links(links)
    if broken_links:
        print("The following links did not respond:")
        for link in broken_links:
            print("- {}".format(link))
        # Exit with a non-zero status so the calling process sees the failure.
        sys.exit(1)
    print("All links are ok.")
|
Loading…
Reference in New Issue
Block a user