mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-04 05:10:06 +06:00

This is the result of: $ black --line-length 119 examples templates transformers utils hubconf.py setup.py There's a lot of fairly long lines in the project. As a consequence, I'm picking the longest widely accepted line length, 119 characters. This is also Thomas' preference, because it allows for explicit variable names, to make the code easier to understand.
80 lines
2.1 KiB
Python
80 lines
2.1 KiB
Python
""" Link tester.
|
|
|
|
This little utility reads all the python files in the repository,
|
|
scans for links pointing to S3 and tests the links one by one. Raises an error
|
|
at the end of the scan if at least one link was reported broken.
|
|
"""
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
import requests
|
|
|
|
|
|
# Matches a quoted string literal that starts with "https://s3".
#   group 1: the opening quote; the closing quote must be the same one (\1)
#   group 2: the "https://s3" prefix
#   group 3: the remainder of the URL
# The remainder is matched lazily so that a line holding several quoted
# strings yields one match per link rather than a single match running up to
# the last quote on the line.
REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*?)\1"""
|
|
|
|
|
|
def list_python_files_in_repository():
    """ List all python files in the repository.

    This function assumes that the script is executed in the root folder:
    the walk starts from the current working directory.

    Returns:
        A list with the path (relative to ".") of every file whose name ends
        with ".py". Directories whose path contains "templates" are skipped.
    """
    source_code_files = []
    for path, _, files in os.walk("."):
        # Anything located under a path containing "templates" is ignored.
        if "templates" in path:
            continue
        # Test the extension itself: a substring test on ".py" would also
        # select names such as "module.pyx" or "script.py.orig".
        source_code_files.extend(os.path.join(path, name) for name in files if name.endswith(".py"))

    return source_code_files
|
|
|
|
|
|
def find_all_links(file_paths):
    """ Gather the links found in every file of `file_paths`.

    Each path is handed to `scan_code_for_links` and the per-file results
    are concatenated, in order, into one flat list.
    """
    return [link for file_path in file_paths for link in scan_code_for_links(file_path)]
|
|
|
|
|
|
def scan_code_for_links(source):
    """ Scans the file to find links using a regular expression.

    Args:
        source: Path of the file to read.

    Returns:
        A list of links, one per match of `REGEXP_FIND_S3_LINKS` in the file.
    """
    # Decode explicitly as UTF-8 (the default encoding of Python source
    # files) instead of relying on the platform's locale encoding.
    with open(source, "r", encoding="utf-8") as source_file:
        content = source_file.read()

    raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
    # Every match is a (quote, prefix, suffix) tuple; the link itself is the
    # prefix followed by the suffix, without the surrounding quotes.
    return [prefix + suffix for _, prefix, suffix in raw_links]
|
|
|
|
|
|
def check_all_links(links, timeout=10):
    """ Check that the provided links are valid.

    Links are considered valid if a HEAD request to the server
    returns a 200 status code. A link whose request fails outright
    (connection error, timeout, malformed URL...) is reported as broken
    instead of aborting the whole scan.

    Args:
        links: Iterable of URLs to test. Duplicated URLs are requested once.
        timeout: Number of seconds to wait for a server before giving up.

    Returns:
        The list of broken links, each listed once.
    """
    broken_links = []
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # same URL referenced from several files is only requested once.
    for link in dict.fromkeys(links):
        try:
            status_code = requests.head(link, timeout=timeout).status_code
        except requests.RequestException:
            # No usable answer from the server: treat the link as broken.
            status_code = None
        if status_code != 200:
            broken_links.append(link)

    return broken_links
|
|
|
|
|
|
if __name__ == "__main__":
    # Announce the scan before it starts: checking every link can take a
    # while, and the message is meaningless once the work is already done.
    print("Looking for broken links to pre-trained models/configs/tokenizers...")
    file_paths = list_python_files_in_repository()
    links = find_all_links(file_paths)
    broken_links = check_all_links(links)
    if broken_links:
        print("The following links did not respond:")
        for link in broken_links:
            print("- {}".format(link))
        # A non-zero exit status signals the failure to the calling process.
        sys.exit(1)
    print("All links are ok.")
|