Set usedforsecurity=False in hashlib methods (FIPS compliance) (#27483)

* Set usedforsecurity=False in hashlib methods (FIPS compliance)

* trigger ci

* tokenizers version

* deps

* bump hfh version

* let's try this
This commit is contained in:
Lucain 2023-11-16 15:29:53 +01:00 committed by GitHub
parent 5603fad247
commit fd65aa9818
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 21 additions and 19 deletions

View File

@ -1,5 +1,4 @@
import gzip
import hashlib
import json
import multiprocessing
import os
@ -11,6 +10,7 @@ from pathlib import Path
import numpy as np
from arguments import PreprocessingArguments
from datasets import load_dataset
from huggingface_hub.utils import insecure_hashlib
from minhash_deduplication import deduplicate_dataset
from transformers import AutoTokenizer, HfArgumentParser
@ -21,7 +21,7 @@ PATTERN = re.compile(r"\s+")
def get_hash(example):
"""Get hash of content field."""
return {"hash": hashlib.md5(re.sub(PATTERN, "", example["content"]).encode("utf-8")).hexdigest()}
return {"hash": insecure_hashlib.md5(re.sub(PATTERN, "", example["content"]).encode("utf-8")).hexdigest()}
def line_stats(example):

View File

@ -28,7 +28,6 @@ import tempfile
from collections import OrderedDict
from contextlib import contextmanager
from functools import partial
from hashlib import sha256
from io import BytesIO
from pathlib import Path
from urllib.parse import urlparse
@ -39,6 +38,7 @@ import numpy as np
import requests
import wget
from filelock import FileLock
from huggingface_hub.utils import insecure_hashlib
from PIL import Image
from tqdm.auto import tqdm
from yaml import Loader, dump, load
@ -402,12 +402,12 @@ def get_from_cache(
def url_to_filename(url, etag=None):
url_bytes = url.encode("utf-8")
url_hash = sha256(url_bytes)
url_hash = insecure_hashlib.sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode("utf-8")
etag_hash = sha256(etag_bytes)
etag_hash = insecure_hashlib.sha256(etag_bytes)
filename += "." + etag_hash.hexdigest()
if url.endswith(".h5"):

View File

@ -28,7 +28,6 @@ import tempfile
from collections import OrderedDict
from contextlib import contextmanager
from functools import partial
from hashlib import sha256
from io import BytesIO
from pathlib import Path
from urllib.parse import urlparse
@ -39,6 +38,7 @@ import numpy as np
import requests
import wget
from filelock import FileLock
from huggingface_hub.utils import insecure_hashlib
from PIL import Image
from tqdm.auto import tqdm
from yaml import Loader, dump, load
@ -402,12 +402,12 @@ def get_from_cache(
def url_to_filename(url, etag=None):
url_bytes = url.encode("utf-8")
url_hash = sha256(url_bytes)
url_hash = insecure_hashlib.sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode("utf-8")
etag_hash = sha256(etag_bytes)
etag_hash = insecure_hashlib.sha256(etag_bytes)
filename += "." + etag_hash.hexdigest()
if url.endswith(".h5"):

View File

@ -118,7 +118,7 @@ _deps = [
"fugashi>=1.0",
"GitPython<3.1.19",
"hf-doc-builder>=0.3.0",
"huggingface-hub>=0.16.4,<1.0",
"huggingface-hub>=0.19.3,<1.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"isort>=5.5.4",
@ -321,6 +321,7 @@ extras["testing"] = (
"rjieba",
"beautifulsoup4",
"tensorboard",
"pydantic",
)
+ extras["retrieval"]
+ extras["modelcreation"]

View File

@ -25,7 +25,7 @@ deps = {
"fugashi": "fugashi>=1.0",
"GitPython": "GitPython<3.1.19",
"hf-doc-builder": "hf-doc-builder>=0.3.0",
"huggingface-hub": "huggingface-hub>=0.16.4,<1.0",
"huggingface-hub": "huggingface-hub>=0.19.3,<1.0",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"isort": "isort>=5.5.4",

View File

@ -15,7 +15,6 @@
# limitations under the License.
import argparse
import hashlib
import io
import json
import os
@ -24,6 +23,7 @@ import urllib
import warnings
import torch
from huggingface_hub.utils import insecure_hashlib
from torch import nn
from tqdm import tqdm
@ -114,7 +114,7 @@ def _download(url: str, root: str) -> io.BytesIO:
if os.path.isfile(download_target):
model_bytes = open(download_target, "rb").read()
if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
if insecure_hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
return torch.load(io.BytesIO(model_bytes))
else:
warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
@ -132,7 +132,7 @@ def _download(url: str, root: str) -> io.BytesIO:
loop.update(len(buffer))
model_bytes = open(download_target, "rb").read()
if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
if insecure_hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
raise RuntimeError(
"Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
)

View File

@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import unittest
from huggingface_hub.utils import insecure_hashlib
from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available
from transformers.pipelines import DepthEstimationPipeline, pipeline
from transformers.testing_utils import (
@ -44,7 +45,7 @@ else:
def hashimage(image: Image) -> str:
m = hashlib.md5(image.tobytes())
m = insecure_hashlib.md5(image.tobytes())
return m.hexdigest()

View File

@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import tempfile
import unittest
from typing import Dict
@ -21,6 +20,7 @@ import datasets
import numpy as np
import requests
from datasets import load_dataset
from huggingface_hub.utils import insecure_hashlib
from transformers import (
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
@ -59,7 +59,7 @@ else:
def hashimage(image: Image) -> str:
m = hashlib.md5(image.tobytes())
m = insecure_hashlib.md5(image.tobytes())
return m.hexdigest()[:10]

View File

@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import unittest
from typing import Dict
import numpy as np
from huggingface_hub.utils import insecure_hashlib
from transformers import (
MODEL_FOR_MASK_GENERATION_MAPPING,
@ -46,7 +46,7 @@ else:
def hashimage(image: Image) -> str:
m = hashlib.md5(image.tobytes())
m = insecure_hashlib.md5(image.tobytes())
return m.hexdigest()[:10]