Check TRUST_REMOTE_CODE for RealmRetriever for security (#36511)

* fix

* repush

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yih-Dar 2025-03-03 15:08:12 +01:00 committed by GitHub
parent 4d8259d245
commit 9e3a1072c2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -21,7 +21,7 @@ import numpy as np
from huggingface_hub import hf_hub_download
from .... import AutoTokenizer
-from ....utils import logging
+from ....utils import logging, strtobool
_REALM_BLOCK_RECORDS_FILENAME = "block_records.npy"
@@ -114,6 +114,14 @@ class RealmRetriever:
block_records_path = hf_hub_download(
repo_id=pretrained_model_name_or_path, filename=_REALM_BLOCK_RECORDS_FILENAME, **kwargs
)
if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
raise ValueError(
"This part uses `pickle.load` which is insecure and will execute arbitrary code that is "
"potentially malicious. It's recommended to never unpickle data that could have come from an "
"untrusted source, or that could have been tampered with. If you already verified the pickle "
"data and decided to use it, you can set the environment variable "
"`TRUST_REMOTE_CODE` to `True` to allow it."
)
block_records = np.load(block_records_path, allow_pickle=True)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)