mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-04 05:10:06 +06:00
offline mode for firewalled envs (#10407)
* offline mode start * add specific values * fix fallback * add test * better values check and range * test that actually works * document the offline mode * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * more strict check * cleaner test * pt-only test * style Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
parent
90ecc29656
commit
88a951e3cc
@ -155,6 +155,31 @@ If you expect to be downloading large volumes of models (more than 1,000) from o
|
||||
your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way
|
||||
faster, and cheaper. Feel free to contact us privately if you need any help.
|
||||
|
||||
### Offline mode
|
||||
|
||||
It's possible to run 🤗 Transformers in a firewalled or a no-network environment.
|
||||
|
||||
Setting environment variable `TRANSFORMERS_OFFLINE=1` will tell 🤗 Transformers to use local files only and will not try to look things up.
|
||||
|
||||
Most likely you may want to couple this with `HF_DATASETS_OFFLINE=1` that performs the same for 🤗 Datasets if you're using the latter.
|
||||
|
||||
Here is an example of how this can be used on a filesystem that is shared between a normally networked instance and an instance that is firewalled off from the external world.
|
||||
|
||||
On the instance with the normal network run your program which will download and cache models (and optionally datasets if you use 🤗 Datasets). For example:
|
||||
|
||||
```
|
||||
python examples/seq2seq/run_seq2seq.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
|
||||
```
|
||||
|
||||
and then with the same filesystem you can now run the same program on a firewalled instance:
|
||||
```
|
||||
HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
|
||||
python examples/seq2seq/run_seq2seq.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
|
||||
```
|
||||
and it should succeed without hanging while waiting to time out.
|
||||
|
||||
|
||||
|
||||
## Do you want to run a Transformer model on a mobile device?
|
||||
|
||||
You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.
|
||||
|
@ -44,15 +44,22 @@ from transformers import (
|
||||
default_data_collator,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.file_utils import is_offline_mode
|
||||
from transformers.trainer_utils import get_last_checkpoint, is_main_process
|
||||
|
||||
|
||||
with FileLock(".lock") as lock:
|
||||
nltk.download("punkt", quiet=True)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
nltk.data.find("tokenizers/punkt")
|
||||
except LookupError:
|
||||
if is_offline_mode():
|
||||
raise LookupError(
|
||||
"Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
|
||||
)
|
||||
with FileLock(".lock") as lock:
|
||||
nltk.download("punkt", quiet=True)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
|
@ -22,7 +22,7 @@ import os
|
||||
from typing import Any, Dict, Tuple, Union
|
||||
|
||||
from . import __version__
|
||||
from .file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url
|
||||
from .file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_offline_mode, is_remote_url
|
||||
from .utils import logging
|
||||
|
||||
|
||||
@ -412,6 +412,10 @@ class PretrainedConfig(object):
|
||||
local_files_only = kwargs.pop("local_files_only", False)
|
||||
revision = kwargs.pop("revision", None)
|
||||
|
||||
if is_offline_mode() and not local_files_only:
|
||||
logger.info("Offline mode: forcing local_files_only=True")
|
||||
local_files_only = True
|
||||
|
||||
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
|
||||
if os.path.isdir(pretrained_model_name_or_path):
|
||||
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
|
||||
|
@ -234,6 +234,13 @@ PRESET_MIRROR_DICT = {
|
||||
}
|
||||
|
||||
|
||||
_is_offline_mode = True if os.environ.get("TRANSFORMERS_OFFLINE", "0").upper() in ENV_VARS_TRUE_VALUES else False
|
||||
|
||||
|
||||
def is_offline_mode():
|
||||
return _is_offline_mode
|
||||
|
||||
|
||||
def is_torch_available():
|
||||
return _torch_available
|
||||
|
||||
|
@ -36,6 +36,7 @@ from .file_utils import (
|
||||
ModelOutput,
|
||||
cached_path,
|
||||
hf_bucket_url,
|
||||
is_offline_mode,
|
||||
is_remote_url,
|
||||
replace_return_docstrings,
|
||||
)
|
||||
@ -964,6 +965,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
|
||||
revision = kwargs.pop("revision", None)
|
||||
mirror = kwargs.pop("mirror", None)
|
||||
|
||||
if is_offline_mode() and not local_files_only:
|
||||
logger.info("Offline mode: forcing local_files_only=True")
|
||||
local_files_only = True
|
||||
|
||||
# Load config if we don't provide a configuration
|
||||
if not isinstance(config, PretrainedConfig):
|
||||
config_path = config if config is not None else pretrained_model_name_or_path
|
||||
|
@ -44,6 +44,7 @@ from .file_utils import (
|
||||
cached_path,
|
||||
hf_bucket_url,
|
||||
is_flax_available,
|
||||
is_offline_mode,
|
||||
is_remote_url,
|
||||
is_tf_available,
|
||||
is_tokenizers_available,
|
||||
@ -1597,6 +1598,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
revision = kwargs.pop("revision", None)
|
||||
subfolder = kwargs.pop("subfolder", None)
|
||||
|
||||
if is_offline_mode() and not local_files_only:
|
||||
logger.info("Offline mode: forcing local_files_only=True")
|
||||
local_files_only = True
|
||||
|
||||
s3_models = list(cls.max_model_input_sizes.keys())
|
||||
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
|
||||
vocab_files = {}
|
||||
|
53
tests/test_offline.py
Normal file
53
tests/test_offline.py
Normal file
@ -0,0 +1,53 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from transformers.testing_utils import TestCasePlus, require_torch
|
||||
|
||||
|
||||
class OfflineTests(TestCasePlus):
    @require_torch
    def test_offline_mode(self):
        """Check that TRANSFORMERS_OFFLINE=1 makes cached model files usable without network.

        TRANSFORMERS_OFFLINE is read once at `transformers` import time, which has
        already happened inside pytest — so each scenario is exercised in a fresh
        subprocess where the environment variable can still take effect.
        """
        # One-liner segments composed into the subprocess's `python -c` payload.
        load = "from transformers import BertConfig, BertModel, BertTokenizer;"
        run = "mname = 'lysandre/tiny-bert-random'; BertConfig.from_pretrained(mname) and BertModel.from_pretrained(mname) and BertTokenizer.from_pretrained(mname);"
        mock = 'import socket; exec("def offline_socket(*args, **kwargs): raise socket.error(\\"Offline mode is enabled.\\")"); socket.socket = offline_socket;'

        env = self.get_env()

        # Scenario 1 — baseline with normal network: downloads/caches the files,
        # so the later offline run has something local to fall back on.
        online_cmd = [sys.executable, "-c", f"{load} {run}"]
        proc = subprocess.run(online_cmd, env=env, check=False, capture_output=True)
        self.assertEqual(proc.returncode, 0, proc.stderr)

        # Scenario 2 — network disabled by the socket mock, offline mode off:
        # the lookup hits the (mocked-away) network and must fail.
        offline_cmd = [sys.executable, "-c", f"{load} {mock} {run}"]
        env["TRANSFORMERS_OFFLINE"] = "0"
        proc = subprocess.run(offline_cmd, env=env, check=False, capture_output=True)
        self.assertEqual(proc.returncode, 1, proc.stderr)

        # Scenario 3 — same mocked-out network, but offline mode on:
        # local cached files are used, so it must succeed.
        env["TRANSFORMERS_OFFLINE"] = "1"
        proc = subprocess.run(offline_cmd, env=env, check=False, capture_output=True)
        self.assertEqual(proc.returncode, 0, proc.stderr)
|
Loading…
Reference in New Issue
Block a user