Remove all traces of low_cpu_mem_usage (#38792)

* remove it from all py files

* remove it from the doc

* remove it from examples

* style

* remove traces of _fast_init

* Update test_peft_integration.py

* CIs
Cyril Vallez 2025-06-12 16:39:33 +02:00 committed by GitHub
parent 3542e0b844
commit 4b8ec667e9
76 changed files with 100 additions and 598 deletions
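The change applied across these files follows a single pattern; below is a minimal before/after sketch for orientation (illustrative only; the checkpoint name is arbitrary and not taken from the diff):

```python
from transformers import AutoModelForCausalLM

# Before this commit: callers could request memory-efficient loading explicitly.
# model = AutoModelForCausalLM.from_pretrained("gpt2", low_cpu_mem_usage=True)

# After this commit: the argument is removed; a plain from_pretrained call is enough.
model = AutoModelForCausalLM.from_pretrained("gpt2")
```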

View File

@ -231,7 +231,7 @@ flush()
Let's see what peak GPU memory consumption 4-bit quantization gives. The model can be quantized to 4-bit using the same API as before - this time by passing `load_in_4bit=True` instead of `load_in_8bit=True`.
```python
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, pad_token_id=0)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

View File

@ -459,7 +459,7 @@ args = TrainingArguments(
model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
model = AutoModelForCausalLM.from_pretrained(model_id).to(0)
trainer = trl.SFTTrainer(
model=model,
@ -503,7 +503,7 @@ args = TrainingArguments(
# Load the model and tokenizer
model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
model = AutoModelForCausalLM.from_pretrained(model_id).to(0)
# Initialize the trainer
trainer = Trainer(
@ -547,7 +547,7 @@ args = TrainingArguments(
model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
model = AutoModelForCausalLM.from_pretrained(model_id).to(0)
trainer = trl.SFTTrainer(
model=model,

View File

@ -51,7 +51,7 @@ torch.random.manual_seed(673)
# load pretrained model and processor
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True)
model = LlavaForConditionalGeneration.from_pretrained(model_id)
# create random image input
random_image = Image.fromarray(torch.randint(0, 256, (224, 224, 3), dtype=torch.uint8).numpy())

View File

@ -236,7 +236,7 @@ flush()
Let's see what peak GPU memory consumption 4-bit quantization gives. Quantizing the model to 4-bit can be done with the same API as before - this time by passing `load_in_4bit=True` instead of `load_in_8bit=True`.
```python
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, pad_token_id=0)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

View File

@ -170,7 +170,6 @@ model_id = "facebook/chameleon-7b"
model = ChameleonForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
attn_implementation="flash_attention_2"
).to(0)
```

View File

@ -157,7 +157,7 @@ import requests
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16)
model.to("cuda:0")
# prepare image and text prompt, using the appropriate prompt template
@ -292,7 +292,6 @@ from transformers import AutoModelForImageTextToText
model = AutoModelForImageTextToText.from_pretrained(
model_id,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
use_flash_attention_2=True
).to(0)
```

View File

@ -121,7 +121,6 @@ processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
"llava-hf/llava-onevision-qwen2-7b-ov-hf",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
device_map="cuda:0"
)
@ -286,7 +285,6 @@ from transformers import LlavaOnevisionForConditionalGeneration
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
use_flash_attention_2=True
).to(0)
```

View File

@ -148,11 +148,6 @@ You need enough memory to hold two copies of the model weights (random and pretrained)
Transformers reduces some of these memory-related challenges with fast initialization, sharded checkpoints, Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature, and supporting lower bit data types.
### Fast initialization
A PyTorch model is instantiated with random weights, or "empty" tensors, that take up space in memory without filling it.
Transformers boosts loading speed by skipping random weight initialization with the [_fast_init](https://github.com/huggingface/transformers/blob/c9f6e5e35156e068b227dd9b15521767f6afd4d2/src/transformers/modeling_utils.py#L2710) parameter if the pretrained weights are correctly initialized. This parameter is set to `True` by default.
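As a toy illustration of the idea (not part of this diff), an uninitialized tensor reserves memory without writing meaningful values into it, which is the cost that skipping random initialization avoids paying twice:

```py
import torch

# Allocates backing memory for a 1024x1024 float32 tensor but leaves its
# contents uninitialized; the model's real values come from the checkpoint.
weights = torch.empty(1024, 1024)
```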
### Sharded checkpoints
@ -245,7 +240,7 @@ Big Model Inference's second feature relates to how weights are loaded and dispa
Both features combined reduce memory usage and loading times for big pretrained models.
Set [device_map](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3061) to `"auto"` to enable Big Model Inference. This also sets the [low_cpu_mem_usage](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3028) parameter to `True`, such that not more than 1x the model size is used in CPU memory.
Set [device_map](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3061) to `"auto"` to enable Big Model Inference.
```py
from transformers import AutoModelForCausalLM
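# Sketch of the call this truncated hunk leads into (not shown in the diff);
# the checkpoint name is borrowed from elsewhere in this commit, not this file.
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")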

View File

@ -39,19 +39,8 @@ rendered properly in your Markdown viewer.
In Transformers 4.20.0, the [`~PreTrainedModel.from_pretrained`] method was reworked to support large models with [Accelerate](https://huggingface.co/docs/accelerate/big_modeling). This requires Accelerate >= 0.9.0 and PyTorch >= 1.9.0. Instead of creating the full model and then loading the pretrained weights into it (which requires twice the model size in memory, one copy for the randomly initialized model and one for the weights), an option was added to create the model as an empty shell and only materialize its parameters when the pretrained weights are loaded.
You can activate this option with `low_cpu_mem_usage=True`. The model is first created on the meta device (with empty weights), and the state dict is then loaded into it (shard by shard in the case of a sharded checkpoint). The maximum RAM used this way is only the full size of the model.
```py
from transformers import AutoModelForSeq2SeqLM
t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", low_cpu_mem_usage=True)
```
In addition, if the model does not fully fit in RAM (currently only valid for inference), you can place it directly on different devices. With `device_map="auto"`, Accelerate decides which device each layer goes on, making maximal use of the fastest devices (GPUs) and offloading the rest to the CPU, or even to the hard drive if you run out of GPU RAM. Even when the model is split across several devices, it runs as you would normally expect.
When passing a `device_map`, `low_cpu_mem_usage` is automatically set to `True`, so you don't need to specify it:
```py
from transformers import AutoModelForSeq2SeqLM
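# A sketch of the likely continuation of this truncated hunk: with
# device_map="auto", Accelerate dispatches layers across the available devices.
t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", device_map="auto")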

View File

@ -227,7 +227,7 @@ flush()
Now let's see the peak GPU memory consumption that 4-bit quantization gives. Quantizing the model to 4-bit can be done with the same API as before - this time by passing `load_in_4bit=True` instead of `load_in_8bit=True`.
```python
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, pad_token_id=0)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

View File

@ -148,7 +148,6 @@ model_id = "facebook/chameleon-7b"
model = ChameleonForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
attn_implementation="flash_attention_2"
).to(0)
```

View File

@ -421,7 +421,7 @@ args = TrainingArguments(
model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
model = AutoModelForCausalLM.from_pretrained(model_id).to(0)
trainer = trl.SFTTrainer(
model=model,

View File

@ -29,18 +29,8 @@ http://www.apache.org/licenses/LICENSE-2.0
In Transformers 4.20.0, the [`~PreTrainedModel.from_pretrained`] method was redesigned to support loading large models with [Accelerate](https://huggingface.co/docs/accelerate/big_modeling). This requires Accelerate >= 0.9.0 and PyTorch >= 1.9.0. Instead of creating the full model and then loading the pretrained weights into it (which takes twice the model size in memory, one copy for the randomly initialized model and one for the pretrained weights), we provide an option to create the model as an empty shell and only instantiate its parameters when the pretrained weights are loaded.
You can activate this option with `low_cpu_mem_usage=True`. The model is first created on the meta device (with empty weights), and the state dict is then loaded into it (shard by shard for a sharded checkpoint). This way, the maximum memory used is only the full size of the model.
```python
from transformers import AutoModelForSeq2SeqLM
t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", low_cpu_mem_usage=True)
```
In addition, if there is not enough memory to load the whole model (currently only applicable to inference), you can place the model directly on different devices. With `device_map="auto"`, Accelerate determines which device to place each layer on so as to maximize use of the fastest devices (GPUs), offloading the rest to the CPU or even to the hard drive if you don't have enough GPU or CPU memory. Even when the model is spread across several devices, it runs as you would normally expect.
When passing a `device_map`, `low_cpu_mem_usage` is automatically set to `True`, so you don't need to specify it:
```python
from transformers import AutoModelForSeq2SeqLM
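# Sketch of the call this truncated hunk leads into: passing device_map="auto"
# lets Accelerate place each layer on the fastest device it fits on.
t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", device_map="auto")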

View File

@ -229,10 +229,6 @@ sure all your batches have the same length.
To use the streaming dataset mode which can be very useful for large datasets, add `--streaming` to the command line. This is supported by `run_mlm.py`, `run_clm.py` and `run_fim.py`. Make sure to adapt the other scripts to your use case by taking inspiration from them.
## Low Cpu Memory Usage
To use low cpu memory mode which can be very useful for LLM, add `--low_cpu_mem_usage` to the command line. This is currently supported by `run_clm.py`,`run_mlm.py`, `run_plm.py`, `run_fim.py`, `run_mlm_no_trainer.py`, `run_clm_no_trainer.py` and `run_fim_no_trainer.py`.
## Creating a model on the fly
When training a model from scratch, configuration values may be overridden with the help of `--config_overrides`:

View File

@ -139,15 +139,6 @@ class ModelArguments:
"choices": ["auto", "bfloat16", "float16", "float32"],
},
)
low_cpu_mem_usage: bool = field(
default=False,
metadata={
"help": (
"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
"set True will benefit LLM loading time and RAM consumption."
)
},
)
def __post_init__(self):
if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
@ -432,7 +423,6 @@ def main():
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
torch_dtype=torch_dtype,
low_cpu_mem_usage=model_args.low_cpu_mem_usage,
)
else:
model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)

View File

@ -228,14 +228,6 @@ def parse_args():
"Only applicable when `--with_tracking` is passed."
),
)
parser.add_argument(
"--low_cpu_mem_usage",
action="store_true",
help=(
"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
"If passed, LLM loading time and RAM consumption will be benefited."
),
)
args = parser.parse_args()
# Sanity checks
@ -409,7 +401,6 @@ def main():
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
low_cpu_mem_usage=args.low_cpu_mem_usage,
trust_remote_code=args.trust_remote_code,
)
else:

View File

@ -142,15 +142,6 @@ class ModelArguments:
"choices": ["auto", "bfloat16", "float16", "float32"],
},
)
low_cpu_mem_usage: bool = field(
default=False,
metadata={
"help": (
"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
"set True will benefit LLM loading time and RAM consumption."
)
},
)
pad_to_multiple_of: bool = field(
default=False,
metadata={
@ -501,7 +492,6 @@ def main():
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
torch_dtype=torch_dtype,
low_cpu_mem_usage=model_args.low_cpu_mem_usage,
attn_implementation=model_args.attn_implementation,
)

View File

@ -288,14 +288,6 @@ def parse_args():
"Only applicable when `--with_tracking` is passed."
),
)
parser.add_argument(
"--low_cpu_mem_usage",
action="store_true",
help=(
"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
"If passed, LLM loading time and RAM consumption will be benefited."
),
)
args = parser.parse_args()
# Sanity checks
@ -474,7 +466,6 @@ def main():
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
low_cpu_mem_usage=args.low_cpu_mem_usage,
trust_remote_code=args.trust_remote_code,
)
else:

View File

@ -136,15 +136,6 @@ class ModelArguments:
"choices": ["auto", "bfloat16", "float16", "float32"],
},
)
low_cpu_mem_usage: bool = field(
default=False,
metadata={
"help": (
"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
"set True will benefit LLM loading time and RAM consumption."
)
},
)
def __post_init__(self):
if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
@ -436,7 +427,6 @@ def main():
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
torch_dtype=torch_dtype,
low_cpu_mem_usage=model_args.low_cpu_mem_usage,
)
else:
logger.info("Training new model from scratch")

View File

@ -235,14 +235,6 @@ def parse_args():
"Only applicable when `--with_tracking` is passed."
),
)
parser.add_argument(
"--low_cpu_mem_usage",
action="store_true",
help=(
"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
"If passed, LLM loading time and RAM consumption will be benefited."
),
)
args = parser.parse_args()
# Sanity checks
@ -406,7 +398,6 @@ def main():
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
low_cpu_mem_usage=args.low_cpu_mem_usage,
trust_remote_code=args.trust_remote_code,
)
else:

View File

@ -103,15 +103,6 @@ class ModelArguments:
)
},
)
low_cpu_mem_usage: bool = field(
default=False,
metadata={
"help": (
"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
"set True will benefit LLM loading time and RAM consumption."
)
},
)
def __post_init__(self):
if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
@ -397,7 +388,6 @@ def main():
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=model_args.token,
low_cpu_mem_usage=model_args.low_cpu_mem_usage,
)
else:
logger.info("Training new model from scratch")

View File

@ -429,7 +429,7 @@ def model_addition_debugger_context(
# load pretrained model and processor
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True)
model = LlavaForConditionalGeneration.from_pretrained(model_id)
# create random image input
random_image = Image.fromarray(torch.randint(0, 256, (224, 224, 3), dtype=torch.uint8).numpy())

View File

@ -37,7 +37,7 @@ Example for creating the old state dict file with Python:
# load model
kwargs = {"device_map": "auto", "torch_dtype": torch.float16}
model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", low_cpu_mem_usage=True, **kwargs)
model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", **kwargs)
# load vision tower
model.get_vision_tower().load_model()

View File

@ -41,9 +41,7 @@ CONVERSION_MAPPING = {
def convert_falcon_h1_to_hf(input_model_path, output_path):
tokenizer = AutoTokenizer.from_pretrained(input_model_path)
model = AutoModelForCausalLM.from_pretrained(
input_model_path, torch_dtype=torch.bfloat16, trust_remote_code=True, low_cpu_mem_usage=True
)
model = AutoModelForCausalLM.from_pretrained(input_model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
intermediate_size = int(model.config.expansion_factor * model.config.hidden_size)

View File

@ -187,7 +187,6 @@ def load_original_state_dict(input_base_path):
model = AutoModel.from_pretrained(
input_base_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=False,
trust_remote_code=True,
).eval()

View File

@ -419,7 +419,7 @@ def write_model(
gc.collect()
print("Loading the checkpoint in a Llama model.")
model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16)
# Avoid saving this as part of the config.
del model.config._name_or_path

View File

@ -40,7 +40,7 @@ Example for creating the old state dict file with Python:
# load model
kwargs = {"device_map": "auto", "torch_dtype": torch.float16}
model = LlavaLlamaForCausalLM.from_pretrained("liuhaotian/llava-v1.5-7b", low_cpu_mem_usage=True, **kwargs)
model = LlavaLlamaForCausalLM.from_pretrained("liuhaotian/llava-v1.5-7b", **kwargs)
# load vision tower
model.get_vision_tower().load_model()

View File

@ -175,7 +175,7 @@ def write_model(model_path, input_base_path, tokenizer_path=None, safe_serializa
_write_tokenizer(model_path, config, tokenizer_path, fix_eos_token_id)
print("Loading the checkpoint in a OLMo model.")
model = OlmoForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True)
model = OlmoForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32)
# Avoid saving this as part of the config.
del model.config._name_or_path
print("Saving in the Transformers format.")

View File

@ -205,7 +205,7 @@ def write_model(
_write_tokenizer(model_path, config, input_base_path, tokenizer_path)
print("Loading the checkpoint in a OLMo2 model.")
model = Olmo2ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True)
model = Olmo2ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32)
# Avoid saving this as part of the config.
del model.config._name_or_path
print("Saving in the Transformers format.")

View File

@ -37,7 +37,7 @@ Example for creating the old state dict file with Python:
# load model
kwargs = {"device_map": "auto", "torch_dtype": torch.float16}
model = VideoLlavaForCausalLM.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", low_cpu_mem_usage=True, **kwargs)
model = VideoLlavaForCausalLM.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", **kwargs)
# load vision tower
model.get_vision_tower().load_model()

View File

@ -2337,7 +2337,6 @@ class GenerationTesterMixin:
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
res_eager = model_eager.generate(**inputs_dict, **generate_kwargs)
@ -2347,7 +2346,6 @@ class GenerationTesterMixin:
model_attn = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation=attn_implementation,
).to(torch_device)
res_attn = model_attn.generate(**inputs_dict, **generate_kwargs)
@ -3724,7 +3722,6 @@ class GenerationIntegrationTests(unittest.TestCase):
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id,
low_cpu_mem_usage=True,
use_safetensors=True,
)
model.to(torch_device)
@ -3743,7 +3740,6 @@ class GenerationIntegrationTests(unittest.TestCase):
# Load its decoder only version:
assistant_causal_lm = AutoModelForCausalLM.from_pretrained(
assistant_distil_model_id,
low_cpu_mem_usage=True,
use_safetensors=True,
).to(torch_device)
self.assertTrue(model.generate(**features, assistant_model=assistant_causal_lm).sum())
@ -3759,7 +3755,6 @@ class GenerationIntegrationTests(unittest.TestCase):
# Load its decoder only version:
assistant_causal_lm = AutoModelForCausalLM.from_pretrained(
assistant_distil_model_id,
low_cpu_mem_usage=True,
use_safetensors=True,
).to(torch_device)
# It will raise an error as the encoder of the main and assistant model are not compatible:

View File

@ -556,7 +556,6 @@ class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
)
.to(torch_device)
.eval()
@ -600,7 +599,7 @@ class BambaModelIntegrationTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
model_id = "ibm-fms/Bamba-9B"
cls.model = BambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
cls.model = BambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
cls.tokenizer = AutoTokenizer.from_pretrained(model_id)
# feels a bit forced to have to do this for the generation test

View File

@ -238,9 +238,7 @@ class CohereIntegrationTest(unittest.TestCase):
).to(device=torch_device, dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = CohereForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
torch_device
)
model = CohereForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
tokenizer.pad_token = tokenizer.eos_token

View File

@ -144,7 +144,7 @@ class Cohere2IntegrationTest(unittest.TestCase):
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager"
model_id, torch_dtype=torch.bfloat16, attn_implementation="eager"
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
@ -168,7 +168,7 @@ class Cohere2IntegrationTest(unittest.TestCase):
# fmt: on
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
model_id, torch_dtype=torch.float16, attn_implementation="eager"
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
@ -189,7 +189,7 @@ class Cohere2IntegrationTest(unittest.TestCase):
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

View File

@ -280,18 +280,6 @@ class DacModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
def test_hidden_states_output(self):
pass
@unittest.skip("No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip("No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip("No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
def test_determinism(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

View File

@ -459,7 +459,6 @@ class DeepseekV3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
model_sdpa = DeepseekV3ForCausalLM.from_pretrained(
"bzantium/tiny-deepseek-v3",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
@ -467,7 +466,6 @@ class DeepseekV3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
model_eager = DeepseekV3ForCausalLM.from_pretrained(
"bzantium/tiny-deepseek-v3",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)

View File

@ -605,18 +605,6 @@ class DeformableDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
def test_two_stage_training(self):
model_class = DeformableDetrForObjectDetection
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

View File

@ -514,7 +514,6 @@ class DiffLlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
model_sdpa = DiffLlamaForCausalLM.from_pretrained(
"kajuma/DiffLlama-0.3B-handcut",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
@ -522,7 +521,6 @@ class DiffLlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
model_eager = DiffLlamaForCausalLM.from_pretrained(
"kajuma/DiffLlama-0.3B-handcut",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)

View File

@ -343,18 +343,6 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
def test_hidden_states_output(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
def test_determinism(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

View File

@ -381,9 +381,7 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
@slow
# Ignore copy
def test_model_from_pretrained(self):
model = FalconMambaModel.from_pretrained(
"tiiuae/falcon-mamba-7b", torch_dtype=torch.float16, low_cpu_mem_usage=True
)
model = FalconMambaModel.from_pretrained("tiiuae/falcon-mamba-7b", torch_dtype=torch.float16)
self.assertIsNotNone(model)
def test_model_outputs_equivalence(self):

View File

@ -126,9 +126,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
model.generation_config.cache_implementation = "static"
@ -149,9 +147,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@ -171,9 +167,7 @@ class GemmaIntegrationTest(unittest.TestCase):
]
# bfloat16 gives strange values, likely due to it has lower precision + very short prompts
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, attn_implementation="eager")
model.to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
@ -195,7 +189,7 @@ class GemmaIntegrationTest(unittest.TestCase):
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
model_id, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
)
model.to(torch_device)
@ -216,7 +210,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I'd like to share with you my experience with the new wattpad wattpad wattpad wattpad wattpad wattpad wattpad",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@ -235,7 +229,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi,\n\nI have a problem with my 2005 1.6 16",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(model_id).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@ -256,9 +250,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I am going to show you how to make a simple and easy to make a DIY 3D",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@ -290,9 +282,7 @@ class GemmaIntegrationTest(unittest.TestCase):
# fmt: on
expected_text = EXPECTED_TEXTS.get_expectation()
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@ -312,9 +302,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I am going to show you how to make a simple and easy to make a DIY 3D",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
model.generation_config.cache_implementation = "static"
@ -333,7 +321,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@ -451,9 +439,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today we have the review for a <strong>2016/2017</strong> season of",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)

View File

@ -197,7 +197,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager"
model_id, torch_dtype=torch.bfloat16, attn_implementation="eager"
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
@ -218,7 +218,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
model_id, torch_dtype=torch.float16, attn_implementation="eager"
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
@ -241,7 +241,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
@ -271,7 +271,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
EXPECTED_BATCH_TEXT = EXPECTED_BATCH_TEXTS.get_expectation()
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
@ -419,7 +419,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
).to(torch_device)
assert model.config._attn_implementation == "flex_attention"
tokenizer = AutoTokenizer.from_pretrained(model_id)

View File

@ -391,9 +391,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
def test_model_4b_bf16(self):
model_id = "google/gemma-3-4b-it"
model = Gemma3ForConditionalGeneration.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
).to(torch_device)
model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
inputs = self.processor.apply_chat_template(
self.messages,
@ -421,9 +419,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
def test_model_4b_batch(self):
model_id = "google/gemma-3-4b-it"
model = Gemma3ForConditionalGeneration.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
).to(torch_device)
model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
messages_2 = [
{"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
@ -474,9 +470,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
def test_model_4b_crops(self):
model_id = "google/gemma-3-4b-it"
model = Gemma3ForConditionalGeneration.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
).to(torch_device)
model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
crop_config = {
"images_kwargs": {
@ -516,9 +510,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
def test_model_4b_batch_crops(self):
model_id = "google/gemma-3-4b-it"
model = Gemma3ForConditionalGeneration.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
).to(torch_device)
model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
crop_config = {
"images_kwargs": {
"do_pan_and_scan": True,
@ -576,9 +568,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
def test_model_4b_multiimage(self):
model_id = "google/gemma-3-4b-it"
model = Gemma3ForConditionalGeneration.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
).to(torch_device)
model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
messages = [
{"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
@ -616,9 +606,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
def test_model_1b_text_only(self):
model_id = "google/gemma-3-1b-it"
model = Gemma3ForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
torch_device
)
model = Gemma3ForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(torch_device)

View File

@ -88,7 +88,7 @@ class GlmIntegrationTest(unittest.TestCase):
]
model = AutoModelForCausalLM.from_pretrained(
self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, revision=self.revision
self.model_id, torch_dtype=torch.float16, revision=self.revision
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
@ -106,7 +106,7 @@ class GlmIntegrationTest(unittest.TestCase):
]
model = AutoModelForCausalLM.from_pretrained(
self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision=self.revision
self.model_id, torch_dtype=torch.bfloat16, revision=self.revision
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
@ -125,7 +125,6 @@ class GlmIntegrationTest(unittest.TestCase):
model = AutoModelForCausalLM.from_pretrained(
self.model_id,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
attn_implementation="eager",
revision=self.revision,
@ -149,7 +148,6 @@ class GlmIntegrationTest(unittest.TestCase):
model = AutoModelForCausalLM.from_pretrained(
self.model_id,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
attn_implementation="sdpa",
revision=self.revision,
@ -174,7 +172,6 @@ class GlmIntegrationTest(unittest.TestCase):
model = AutoModelForCausalLM.from_pretrained(
self.model_id,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
revision=self.revision,

View File

@ -104,9 +104,7 @@ class Glm4IntegrationTest(unittest.TestCase):
)
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
model = AutoModelForCausalLM.from_pretrained(
self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16
).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@ -132,9 +130,7 @@ class Glm4IntegrationTest(unittest.TestCase):
)
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
model = AutoModelForCausalLM.from_pretrained(
self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@ -162,7 +158,6 @@ class Glm4IntegrationTest(unittest.TestCase):
model = AutoModelForCausalLM.from_pretrained(
self.model_id,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
attn_implementation="eager",
)
@ -195,7 +190,6 @@ class Glm4IntegrationTest(unittest.TestCase):
model = AutoModelForCausalLM.from_pretrained(
self.model_id,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
attn_implementation="sdpa",
)
@ -226,7 +220,6 @@ class Glm4IntegrationTest(unittest.TestCase):
model = AutoModelForCausalLM.from_pretrained(
self.model_id,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
)

View File

@ -87,9 +87,9 @@ class HeliumIntegrationTest(unittest.TestCase):
"Hello, today is a great day to start a new project. I have been working on a new project for a while now and I have"
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision="refs/pr/1"
).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, revision="refs/pr/1").to(
torch_device
)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision="refs/pr/1")
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)

View File

@ -727,7 +727,7 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
def test_inference_vicuna_7b(self):
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
model = InstructBlipForConditionalGeneration.from_pretrained(
"Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True
"Salesforce/instructblip-vicuna-7b", load_in_8bit=True
)
url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
@ -752,7 +752,6 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
model = InstructBlipForConditionalGeneration.from_pretrained(
"Salesforce/instructblip-flan-t5-xl",
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
).to(torch_device)
url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
@ -789,7 +788,6 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
model = InstructBlipForConditionalGeneration.from_pretrained(
"Salesforce/instructblip-flan-t5-xl",
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
).to(torch_device)
processor.image_processor.size = {"height": 500, "width": 500}
@ -810,7 +808,6 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
model = InstructBlipForConditionalGeneration.from_pretrained(
"Salesforce/instructblip-flan-t5-xl",
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
).to(torch_device)
image = prepare_img()

View File

@ -744,7 +744,8 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
def test_inference_vicuna_7b(self):
processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
model = InstructBlipVideoForConditionalGeneration.from_pretrained(
"Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True
"Salesforce/instructblip-vicuna-7b",
load_in_8bit=True,
)
clip = prepare_video()
@ -762,7 +763,8 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
def test_expansion_in_processing(self):
processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
model = InstructBlipVideoForConditionalGeneration.from_pretrained(
"Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True
"Salesforce/instructblip-vicuna-7b",
load_in_8bit=True,
)
clip = prepare_video()

View File

@ -527,7 +527,6 @@ class JambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
load_in_4bit=True,
)
@ -563,7 +562,10 @@ class JambaModelIntegrationTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
model_id = "ai21labs/Jamba-tiny-dev"
cls.model = JambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
cls.model = JambaForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
)
cls.tokenizer = AutoTokenizer.from_pretrained(model_id)
cls.device_properties = get_device_properties()

View File

@ -765,18 +765,6 @@ class LxmertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
return tf_inputs_dict
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
@unittest.skip(
reason="This architecture has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245"
)

View File

@ -351,18 +351,6 @@ class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
def assert_tensors_close(a, b, atol=1e-12, prefix=""):
"""If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""

View File

@ -246,9 +246,10 @@ class MiniMaxIntegrationTest(unittest.TestCase):
model_id = "hf-internal-testing/MiniMax-tiny"
dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)
model = MiniMaxForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
torch_device
)
model = MiniMaxForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
).to(torch_device)
expected_slice = torch.tensor(
[[1.0312, -0.5156, -0.3262], [-0.1152, 0.4336, 0.2412], [1.2188, -0.5898, -0.0381]]
).to(torch_device)
@ -265,9 +266,10 @@ class MiniMaxIntegrationTest(unittest.TestCase):
model_id = "hf-internal-testing/MiniMax-tiny"
dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)
model = MiniMaxForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
torch_device
)
model = MiniMaxForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
).to(torch_device)
expected_slice = (
torch.tensor([[0, 1, 0, 933, 307, 3102, 2457, 1208], [0, 1, 0, 933, 307, 3102, 2457, 1208]])
.to(torch.int64)

View File

@ -156,9 +156,10 @@ class MixtralIntegrationTest(unittest.TestCase):
model_id = "hf-internal-testing/Mixtral-tiny"
dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)
model = MixtralForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
torch_device
)
model = MixtralForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
).to(torch_device)
# TODO: might need to tweak it in case the logits do not match on our daily runners
# these logits have been obtained with the original megablocks implementation.
# ("cuda", 8) for A100/A10, and ("cuda", 7) for T4
@ -189,9 +190,10 @@ class MixtralIntegrationTest(unittest.TestCase):
dummy_input = torch.LongTensor([[0, 0, 0, 0, 0, 0, 1, 2, 3], [1, 1, 2, 3, 4, 5, 6, 7, 8]]).to(torch_device)
attention_mask = dummy_input.ne(0).to(torch.long)
model = MixtralForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
torch_device
)
model = MixtralForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
).to(torch_device)
# TODO: might need to tweak it in case the logits do not match on our daily runners
#

View File

@ -722,7 +722,6 @@ class MoshiTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
@ -730,7 +729,6 @@ class MoshiTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)

View File

@ -788,18 +788,6 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
def test_tied_weights_keys(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
# override since changing `output_hidden_states` / `output_attentions` from the top-level model config won't work
def test_retain_grad_hidden_states_attentions(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

View File

@ -789,18 +789,6 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
def test_tied_weights_keys(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
# override since changing `output_hidden_states` / `output_attentions` from the top-level model config won't work
# Ignore copy
def test_retain_grad_hidden_states_attentions(self):

View File

@ -326,18 +326,6 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
def test_feed_forward_chunking(self):
pass
@unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
@unittest.skip(
reason="VLMs doesn't accept inputs embeds and pixel values at the same time. So if the test passed for backbone LM, it passes for VLM also"
)

View File

@ -316,18 +316,6 @@ class PaliGemma2ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
def test_feed_forward_chunking(self):
pass
@unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
@unittest.skip(
reason="VLMs doesn't accept inputs embeds and pixel values at the same time. So if the test passed for backbone LM, it passes for VLM also"
)

View File

@ -368,10 +368,6 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
def test_prompt_lookup_decoding_matches_greedy_search(self):
super().test_prompt_lookup_decoding_matches_greedy_search()
@unittest.skip(reason="The base class is LM only and cannot be init with XModelConfig`")
def test_save_load_fast_init_from_base(self):
pass
# The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
# because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
# TODO: @raushan

View File

@ -318,10 +318,6 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
def test_generate_from_inputs_embeds_with_static_cache(self):
pass
@unittest.skip(reason="The base class is LM only and cannot be init with XModelConfig`")
def test_save_load_fast_init_from_base(self):
pass
# The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
# because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
# TODO: @raushan

View File

@ -182,7 +182,9 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase):
@require_read_token
def test_2b_generate(self):
EXPECTED_TEXTS = ['Hello I am doing a project on the topic of "The impact of the internet on the society" and I am looking for some information on the topic. I am looking for some information on the impact of the internet on the society. I am looking for some information on the impact of the internet on the society. I am looking for some', 'Hi today is a new app that allows you to make money by watching videos.\n\nThe app is very simple to use and you can earn money by watching videos.\n\nThe app is available for both Android and iOS devices and you can download it from the Google Play Store or the App Store.\n\nOnce you have downloaded the app'] # fmt: skip
model = AutoModelForCausalLM.from_pretrained(self.model_id, low_cpu_mem_usage=True).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(
self.model_id,
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id)
tokenizer.padding_side = "right"
@ -204,9 +206,7 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase):
self.assertEqual(output_text, EXPECTED_TEXTS)
model = AutoModelForCausalLM.from_pretrained(
self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16
).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device)
output = model.generate(**inputs, max_new_tokens=64, do_sample=False)
del model
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
@ -246,9 +246,7 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase):
def test_long_context(self):
EXPECTED_GENERATION = [' Jean-Paul Delannoy told CNN that the BEA is "not aware of any video footage that could have been taken on board the plane." He added that the BEA is "not aware of any video footage that could have been taken on board the plane." The BEA is the French equivalent of the National Transportation Safety Board'] # fmt: skip
model = AutoModelForCausalLM.from_pretrained(
self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16
).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id, padding_side="left")
inputs = tokenizer(self.input_long_text, return_tensors="pt").to(torch_device)
output = model.generate(**inputs, max_new_tokens=64, do_sample=False)
@ -260,9 +258,7 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase):
def test_longer_than_window(self):
EXPECTED_GENERATION = [" Robin's comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the"] # fmt: skip
model = AutoModelForCausalLM.from_pretrained(
self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16
).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device)
model.config.attention_window_size = 256 # Make the attention window size shorter than the current prompt
tokenizer = AutoTokenizer.from_pretrained(self.model_id, padding_side="left")
inputs = tokenizer(self.input_long_text, return_tensors="pt").to(torch_device)

View File

@ -248,14 +248,6 @@ class SamVisionModelTest(ModelTesterMixin, unittest.TestCase):
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_to_base(self):
pass
@unittest.skip(reason="SamVisionModel does not support training")
def test_retain_grad_hidden_states_attentions(self):
pass

View File

@ -256,14 +256,6 @@ class SamHQVisionModelTest(ModelTesterMixin, unittest.TestCase):
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_to_base(self):
pass
@unittest.skip(reason="SamVisionModel does not support training")
def test_retain_grad_hidden_states_attentions(self):
pass
@ -695,14 +687,6 @@ class SamHQModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="SamHQModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(reason="SamHQModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_to_base(self):
pass
@unittest.skip(reason="SamHQModel does not support training")
def test_retain_grad_hidden_states_attentions(self):
pass

View File

@ -325,18 +325,6 @@ class SEWModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
def test_model_get_set_embeddings(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
def test_retain_grad_hidden_states_attentions(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.output_hidden_states = True

View File

@@ -430,18 +430,6 @@ class SEWDModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
def test_feed_forward_chunking(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
@slow
def test_model_from_pretrained(self):
model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k")

View File

@@ -49,9 +49,9 @@ class ShieldGemma2IntegrationTest(unittest.TestCase):
response = requests.get(url)
image = Image.open(BytesIO(response.content))
model = ShieldGemma2ForImageClassification.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
).to(torch_device)
model = ShieldGemma2ForImageClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(
torch_device
)
inputs = processor(images=[image]).to(torch_device)
output = model(**inputs)

View File

@@ -1109,14 +1109,16 @@ class T5ModelFp16Tests(unittest.TestCase):
# Load using `accelerate` in bf16
model = T5ForConditionalGeneration.from_pretrained(
"google-t5/t5-small", torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
"google-t5/t5-small",
torch_dtype=torch.bfloat16,
)
self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.bfloat16)
self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.bfloat16)
# Load without using `accelerate`
model = T5ForConditionalGeneration.from_pretrained(
"google-t5/t5-small", torch_dtype=torch.float16, low_cpu_mem_usage=True
"google-t5/t5-small",
torch_dtype=torch.float16,
)
self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.float16)
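For reference, the dtype behaviour asserted above can be checked in a couple of lines. This is a standalone sketch (not part of the commit), reusing the checkpoint and layer path from the test; it relies on T5 keeping `DenseReluDense.wo` in float32 when the model is loaded in float16:

```python
import torch
from transformers import T5ForConditionalGeneration

# Load in float16; T5 keeps some modules (e.g. DenseReluDense.wo) in float32
# for numerical stability, which is exactly what the test above asserts.
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", torch_dtype=torch.float16)
ff = model.decoder.block[0].layer[2].DenseReluDense
print(ff.wo.weight.dtype)  # torch.float32
print(ff.wi.weight.dtype)  # torch.float16
```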

View File

@@ -156,18 +156,6 @@ class TimmBackboneModelTest(ModelTesterMixin, BackboneTesterMixin, PipelineTeste
def test_save_load(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_checkpoints(self):
pass
@unittest.skip(reason="No support for low_cpu_mem_usage=True.")
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
pass
@unittest.skip(reason="TimmBackbone uses its own `from_pretrained` without device_map support")
def test_can_load_with_device_context_manager(self):
pass

View File

@@ -407,12 +407,6 @@ class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
normalized_1 = F.softmax(out_shared_prefix_last_tokens)
torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)
@unittest.skip(
"Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
)
def test_save_load_low_cpu_mem_usage(self):
pass
@slow
def test_model_from_pretrained(self):
model_name = "microsoft/udop-large"
@@ -615,12 +609,6 @@ class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase):
normalized_1 = F.softmax(out_shared_prefix_last_tokens)
torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)
@unittest.skip(
"Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
)
def test_save_load_low_cpu_mem_usage(self):
pass
@require_torch
@require_sentencepiece

View File

@@ -2431,7 +2431,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
torch_dtype = torch.float16 if (torch.cuda.is_available() or is_torch_xpu_available()) else torch.float32
model_id = "openai/whisper-large-v2"
model = WhisperForConditionalGeneration.from_pretrained(
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(torch_device)
@@ -2439,7 +2439,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
assistant_model_id = "distil-whisper/distil-large-v2"
assistant_model = WhisperForCausalLM.from_pretrained(
assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
assistant_model_id, torch_dtype=torch_dtype, use_safetensors=True
)
assistant_model.to(torch_device)
@@ -2481,7 +2481,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
torch_dtype = torch.float16 if torch_device in ["cuda", "xpu"] else torch.float32
model_id = "openai/whisper-large-v2"
model = WhisperForConditionalGeneration.from_pretrained(
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(torch_device)
@@ -2489,7 +2489,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
assistant_model_id = "openai/whisper-tiny"
assistant_model = WhisperForConditionalGeneration.from_pretrained(
assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
assistant_model_id, torch_dtype=torch_dtype, use_safetensors=True
)
assistant_model.to(torch_device)
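The four Whisper hunks above come from the speculative-decoding tests. As a rough usage sketch (model IDs taken from the hunks; the dummy audio dataset and the rest are assumed, not taken from this commit), assisted generation only needs the smaller model passed to `generate` via `assistant_model`:

```python
import torch
from datasets import load_dataset
from transformers import WhisperForCausalLM, WhisperForConditionalGeneration, WhisperProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v2", torch_dtype=torch_dtype, use_safetensors=True
).to(device)
assistant_model = WhisperForCausalLM.from_pretrained(
    "distil-whisper/distil-large-v2", torch_dtype=torch_dtype, use_safetensors=True
).to(device)

# A short clip from the dummy LibriSpeech split used elsewhere in the Whisper tests.
sample = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")[0]["audio"]
inputs = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt")
input_features = inputs.input_features.to(device, dtype=torch_dtype)

# Assisted (speculative) decoding: the assistant drafts tokens, the main model verifies them.
generated_ids = model.generate(input_features, assistant_model=assistant_model)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))
```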

View File

@@ -531,7 +531,6 @@ class ZambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
load_in_4bit=True,
)
@@ -565,9 +564,7 @@ class ZambaModelIntegrationTest(unittest.TestCase):
@slow
def setUpClass(cls):
model_id = "Zyphra/Zamba-7B-v1"
cls.model = ZambaForCausalLM.from_pretrained(
model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, use_mamba_kernels=False
)
cls.model = ZambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, use_mamba_kernels=False)
cls.tokenizer = AutoTokenizer.from_pretrained(model_id)
@slow

View File

@@ -549,7 +549,6 @@ class Zamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
load_in_4bit=True,
)
@@ -610,9 +609,7 @@ class Zamba2ModelIntegrationTest(unittest.TestCase):
@slow
def setUpClass(cls):
model_id = "Zyphra/Zamba2-1.2B"
cls.model = Zamba2ForCausalLM.from_pretrained(
model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True, revision="PR"
)
cls.model = Zamba2ForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, revision="PR")
cls.tokenizer = AutoTokenizer.from_pretrained(model_id, revision="PR")
@parameterized.expand([(torch_device,), ("cpu",)])

View File

@@ -328,7 +328,6 @@ class AwqFusedTest(unittest.TestCase):
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=quantization_config,
low_cpu_mem_usage=True,
revision=self.model_revision,
).to(torch_device)
@@ -347,7 +346,6 @@ class AwqFusedTest(unittest.TestCase):
model = AutoModelForCausalLM.from_pretrained(
model_id,
quantization_config=quantization_config,
low_cpu_mem_usage=True,
).to(torch_device)
# Check if model has been correctly fused
@@ -370,7 +368,6 @@ class AwqFusedTest(unittest.TestCase):
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=quantization_config,
low_cpu_mem_usage=True,
revision=self.model_revision,
).to(torch_device)
@@ -399,7 +396,6 @@ class AwqFusedTest(unittest.TestCase):
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=quantization_config,
low_cpu_mem_usage=True,
revision=self.model_revision,
).to(torch_device)
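The AwqFusedTest hunks above only drop the flag from `from_pretrained`; for context, fused AWQ modules are enabled through the quantization config, roughly like this (checkpoint and fusing parameters are illustrative, taken from the library documentation rather than from this commit):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig

# do_fuse swaps attention/MLP blocks for fused kernels; fuse_max_seq_len bounds
# the sequence length the fused cache is allocated for.
quantization_config = AwqConfig(bits=4, do_fuse=True, fuse_max_seq_len=512)

model_id = "TheBloke/Mistral-7B-OpenOrca-AWQ"  # example pre-quantized AWQ checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config).to("cuda")

inputs = tokenizer("Hello, my name is", return_tensors="pt").to("cuda")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```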

View File

@@ -42,7 +42,6 @@ class HQQLLMRunner:
torch_dtype=compute_dtype,
device_map=device,
quantization_config=quant_config,
low_cpu_mem_usage=True,
cache_dir=cache_dir,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
@@ -233,7 +232,9 @@ class HQQSerializationTest(unittest.TestCase):
# Load and check if the logits match
model_loaded = AutoModelForCausalLM.from_pretrained(
"quant_model", torch_dtype=torch.float16, device_map=torch_device, low_cpu_mem_usage=True
"quant_model",
torch_dtype=torch.float16,
device_map=torch_device,
)
with torch.no_grad():

View File

@@ -578,87 +578,6 @@ class ModelTesterMixin:
f"The following keys are not properly handled by `_init_weights()`:\n{different_weights}",
)
@slow
@require_accelerate
@mark.accelerate_tests
def test_save_load_low_cpu_mem_usage(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
with tempfile.TemporaryDirectory() as saved_model_path:
for model_class in self.all_model_classes:
model_to_save = model_class(config)
model_to_save.save_pretrained(saved_model_path)
self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path)
@slow
@require_accelerate
@mark.accelerate_tests
def test_save_load_low_cpu_mem_usage_checkpoints(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
with tempfile.TemporaryDirectory() as saved_model_path:
for model_class in self.all_model_classes:
model_to_save = model_class(config)
model_to_save.config.save_pretrained(saved_model_path)
torch.save(model_to_save.state_dict(), os.path.join(saved_model_path, "pytorch_model.bin"))
self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path)
@slow
@require_accelerate
@mark.accelerate_tests
def test_save_load_low_cpu_mem_usage_no_safetensors(self):
with tempfile.TemporaryDirectory() as saved_model_path:
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model_to_save = model_class(config)
model_to_save.save_pretrained(saved_model_path, safe_serialization=False)
self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path)
def _check_save_load_low_cpu_mem_usage(self, model_class, saved_model_path):
from accelerate.utils.modeling import named_module_tensors
# Load the low usage and the normal models.
model_low_usage, loading_info = model_class.from_pretrained(
saved_model_path,
low_cpu_mem_usage=True,
output_loading_info=True,
)
model_non_low_usage = model_class.from_pretrained(saved_model_path)
# Check that there were no missing keys.
self.assertEqual(loading_info["missing_keys"], [])
# The low_cpu_mem_usage=True causes the model params to be initialized with device=meta, and then
# subsequently loaded with the correct values and onto the correct device. We check if there are any
# remaining params that were not properly loaded.
for name, tensor in named_module_tensors(model_low_usage, recurse=True):
self.assertNotEqual(
tensor.device,
torch.device("meta"),
"Tensor '" + name + "' has not been properly loaded and has device=meta.",
)
# Check that the parameters are equal.
for p1, p2 in zip(model_low_usage.parameters(), model_non_low_usage.parameters()):
self.assertEqual(p1.data.ne(p2.data).sum(), 0)
# Check that the state dict keys are equal.
self.assertEqual(set(model_low_usage.state_dict().keys()), set(model_non_low_usage.state_dict().keys()))
# Check that the shared tensors are equal.
tensor_ptrs1 = collections.defaultdict(list)
for name, tensor in model_low_usage.state_dict().items():
tensor_ptrs1[id_tensor_storage(tensor)].append(name)
tied_params1 = [names for _, names in tensor_ptrs1.items() if len(names) > 1]
tensor_ptrs2 = collections.defaultdict(list)
for name, tensor in model_non_low_usage.state_dict().items():
tensor_ptrs2[id_tensor_storage(tensor)].append(name)
tied_params2 = [names for _, names in tensor_ptrs2.items() if len(names) > 1]
self.assertEqual(tied_params1, tied_params2)
def test_torch_save_load(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
if config.__class__ not in MODEL_MAPPING:
@@ -4100,7 +4019,6 @@ class ModelTesterMixin:
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
load_in_4bit=True,
)
@@ -4173,7 +4091,6 @@ class ModelTesterMixin:
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
)
.to(torch_device)
.eval()
@@ -4248,7 +4165,6 @@ class ModelTesterMixin:
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
)
.to(torch_device)
.eval()
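The removed `_check_save_load_low_cpu_mem_usage` helper above compared tied weights by grouping state-dict entries that share a storage. A standalone sketch of that grouping (checkpoint borrowed from the utility tests further down; the `id_tensor_storage` import path is assumed):

```python
import collections

from transformers import BertModel
from transformers.pytorch_utils import id_tensor_storage

model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")

# Group state-dict entries by underlying storage: any group with more than
# one name is a set of tied (shared) weights.
ptr_to_names = collections.defaultdict(list)
for name, tensor in model.state_dict().items():
    ptr_to_names[id_tensor_storage(tensor)].append(name)

tied_groups = [names for names in ptr_to_names.values() if len(names) > 1]
print(tied_groups)  # e.g. [] for a plain encoder with no tied weights
```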

View File

@@ -64,7 +64,6 @@ from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torch_multi_accelerator,
require_usr_bin_time,
slow,
torch_device,
)
@@ -1003,57 +1002,6 @@ class ModelUtilsTest(TestCasePlus):
self.assertIsNotNone(model)
@require_accelerate
@mark.accelerate_tests
def test_from_pretrained_low_cpu_mem_usage_functional(self):
# test that we can use `from_pretrained(..., low_cpu_mem_usage=True)` with normal and
# sharded models
mnames = [
"hf-internal-testing/tiny-random-bert-sharded",
"hf-internal-testing/tiny-random-bert",
]
for mname in mnames:
_ = BertModel.from_pretrained(mname, low_cpu_mem_usage=True)
@slow
@require_usr_bin_time
@require_accelerate
@mark.accelerate_tests
def test_from_pretrained_low_cpu_mem_usage_equal(self):
# Before this would test that `from_pretrained(..., low_cpu_mem_usage=True)` uses less cpu memory than default
# Now though these should be around the same.
# TODO: Look for good bounds to check that their timings are near the same
mname = "HuggingFaceTB/SmolLM-135M"
preamble = "from transformers import AutoModel"
one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=False)'
# Save this output as `max_rss_normal` if testing memory results
max_rss_normal = self.python_one_liner_max_rss(one_liner_str)
one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=True)'
# Save this output as `max_rss_low_mem` if testing memory results
max_rss_low_mem = self.python_one_liner_max_rss(one_liner_str)
# Should be within 5MBs of each other (overhead)
self.assertAlmostEqual(
max_rss_normal / 1024 / 1024,
max_rss_low_mem / 1024 / 1024,
delta=5,
msg="using `low_cpu_mem_usage` should incur the same memory usage in both cases.",
)
# if you want to compare things manually, let's first look at the size of the model in bytes
# model = AutoModel.from_pretrained(mname, low_cpu_mem_usage=False)
# total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
# total_bytes = total_numel * 4
# Now the diff_bytes should be very close to total_bytes, but the reports are inconsistent.
# The easiest way to test this is to switch the model and torch.load to do all the work on
# gpu - that way one can measure exactly the total and peak memory used. Perhaps once we add
# functionality to load models directly on gpu, this test can be rewritten to use torch's
# cuda memory tracking and then we should be able to do a much more precise test.
@require_accelerate
@mark.accelerate_tests
@require_torch_multi_accelerator
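The removed `test_from_pretrained_low_cpu_mem_usage_equal` above measured the peak RSS of a one-liner model load run in a fresh interpreter (via the `TestCasePlus` helper). A rough standalone equivalent using only the standard library, assuming Linux semantics for `ru_maxrss` (kilobytes) and reusing the checkpoint named in the removed test:

```python
import resource
import subprocess
import sys

# Run the load in a child process so its peak RSS is isolated from ours.
one_liner = 'from transformers import AutoModel; AutoModel.from_pretrained("HuggingFaceTB/SmolLM-135M")'
subprocess.run([sys.executable, "-c", one_liner], check=True)

# On Linux, ru_maxrss for terminated children is reported in kilobytes.
peak_mb = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss / 1024
print(f"peak RSS across child processes: {peak_mb:.0f} MB")
```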
@@ -1537,7 +1485,6 @@ class ModelUtilsTest(TestCasePlus):
config=model_config,
ignore_mismatched_sizes=True,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
)
model_ref = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id)
@@ -1782,16 +1729,6 @@ class ModelUtilsTest(TestCasePlus):
)
self.assertTrue(check_models_equal(model, model_loaded))
def test_load_model_with_state_dict_only_low_cpu_mem_usage(self):
model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
state_dict = model.state_dict()
config = model.config
model_loaded = BertModel.from_pretrained(
pretrained_model_name_or_path=None, config=config, state_dict=state_dict, low_cpu_mem_usage=True
)
self.assertTrue(check_models_equal(model, model_loaded))
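The removed test above exercised state-dict-only loading with the flag; the loading pattern itself is unchanged by this commit. A minimal sketch of it (assuming the `state_dict` and `config` arguments keep their current behaviour):

```python
from transformers import BertModel

# Export the pieces from an existing checkpoint...
model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
state_dict = model.state_dict()
config = model.config

# ...and rebuild the model from the in-memory objects, without a checkpoint path.
reloaded = BertModel.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=state_dict)
```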
def test_cache_when_needed_at_train_time(self):
"""
Some fine-tuning methods require the use of cache, like prefix tuning in PEFT. This test checks that a cache