transformers/src/transformers/integrations/__init__.py
mobicham 59952994c4
Add HQQ quantization support (#29637)
* update HQQ transformers integration

* push import_utils.py

* add force_hooks check in modeling_utils.py

* fix | with Optional

* force bias as param

* check bias is Tensor

* force forward for multi-gpu

* review fixes pass

* remove torch grad()

* if any key in linear_tags fix

* add cpu/disk check

* isinstance return

* add multigpu test + refactor tests

* clean hqq_utils imports in hqq.py

* clean hqq_utils imports in quantizer_hqq.py

* delete hqq_utils.py

* Delete src/transformers/utils/hqq_utils.py

* ruff init

* remove torch.float16 from __init__ in test

* refactor test

* isinstance -> type in quantizer_hqq.py

* cpu/disk device_map check in quantizer_hqq.py

* remove type(module) nn.linear check in quantizer_hqq.py

* add BaseQuantizeConfig import inside HqqConfig init

* remove hqq import in hqq.py

* remove accelerate import from test_hqq.py

* quant config.py doc update

* add hqqconfig to main_classes doc

* make style

* __init__ fix

* ruff __init__

* skip_modules list

* hqqconfig format fix

* hqqconfig doc fix

* test_hqq.py remove mistral comment

* remove self.using_multi_gpu is False

* torch_dtype default val set and logger.info

* hqq.py isinstance fix

* remove torch=None

* torch_device test_hqq

* rename test_hqq

* MODEL_ID in test_hqq

* quantizer_hqq setattr fix

* quantizer_hqq typo fix

* imports quantizer_hqq.py

* isinstance quantizer_hqq

* hqq_layer.bias reformat quantizer_hqq

* Step 2 as comment in quantizer_hqq

* prepare_for_hqq_linear() comment

* keep_in_fp32_modules fix

* HqqHfQuantizer reformat

* quantization.md hqqconfig

* quantization.md model example reformat

* quantization.md # space

* quantization.md space   })

* quantization_config fix doc

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* axis value check in quantization_config

* format

* dynamic config explanation

* quant config method in quantization.md

* remove shard-level progress

* .cuda fix modeling_utils

* test_hqq fixes

* make fix-copies

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2024-05-02 17:51:49 +01:00
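
For context, a minimal usage sketch of the config this PR adds (not taken from the diff itself; the model id and parameter values are illustrative, and the hqq package must be installed):

import torch
from transformers import AutoModelForCausalLM, HqqConfig

# Illustrative settings: 4-bit weights with a quantization group size of 64.
quant_config = HqqConfig(nbits=4, group_size=64)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # hypothetical model id, for illustration only
    torch_dtype=torch.float16,
    device_map="cuda",
    quantization_config=quant_config,
)

The commits above also reference a per-layer dynamic config and a skip_modules list for layers that should stay unquantized.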


# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ..utils import _LazyModule

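# _import_structure maps each submodule to the public names it exports.
# _LazyModule uses this table to defer the real imports until a symbol is
# first accessed, which keeps `import transformers` fast even though the
# integrations pull in heavy optional dependencies (deepspeed, wandb, ...).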
_import_structure = {
    "aqlm": ["replace_with_aqlm_linear"],
    "awq": [
        "fuse_awq_modules",
        "post_init_awq_exllama_modules",
        "replace_with_awq_linear",
    ],
    "bitsandbytes": [
        "get_keys_to_not_convert",
        "replace_8bit_linear",
        "replace_with_bnb_linear",
        "set_module_8bit_tensor_to_device",
        "set_module_quantized_tensor_to_device",
    ],
    "deepspeed": [
        "HfDeepSpeedConfig",
        "HfTrainerDeepSpeedConfig",
        "deepspeed_config",
        "deepspeed_init",
        "deepspeed_load_checkpoint",
        "deepspeed_optim_sched",
        "is_deepspeed_available",
        "is_deepspeed_zero3_enabled",
        "set_hf_deepspeed_config",
        "unset_hf_deepspeed_config",
    ],
    "eetq": ["replace_with_eetq_linear"],
    "hqq": ["prepare_for_hqq_linear"],
    "integration_utils": [
        "INTEGRATION_TO_CALLBACK",
        "AzureMLCallback",
        "ClearMLCallback",
        "CodeCarbonCallback",
        "CometCallback",
        "DagsHubCallback",
        "DVCLiveCallback",
        "FlyteCallback",
        "MLflowCallback",
        "NeptuneCallback",
        "NeptuneMissingConfiguration",
        "TensorBoardCallback",
        "WandbCallback",
        "get_available_reporting_integrations",
        "get_reporting_integration_callbacks",
        "hp_params",
        "is_azureml_available",
        "is_clearml_available",
        "is_codecarbon_available",
        "is_comet_available",
        "is_dagshub_available",
        "is_dvclive_available",
        "is_flyte_deck_standard_available",
        "is_flytekit_available",
        "is_mlflow_available",
        "is_neptune_available",
        "is_optuna_available",
        "is_ray_available",
        "is_ray_tune_available",
        "is_sigopt_available",
        "is_tensorboard_available",
        "is_wandb_available",
        "rewrite_logs",
        "run_hp_search_optuna",
        "run_hp_search_ray",
        "run_hp_search_sigopt",
        "run_hp_search_wandb",
    ],
    "peft": ["PeftAdapterMixin"],
    "quanto": ["replace_with_quanto_layers"],
}
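
# The TYPE_CHECKING branch below never executes at runtime; it only exposes
# the real symbols to static type checkers and IDEs. At runtime, the else
# branch replaces this module with a _LazyModule built from _import_structure.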
if TYPE_CHECKING:
    from .aqlm import replace_with_aqlm_linear
    from .awq import (
        fuse_awq_modules,
        post_init_awq_exllama_modules,
        replace_with_awq_linear,
    )
    from .bitsandbytes import (
        get_keys_to_not_convert,
        replace_8bit_linear,
        replace_with_bnb_linear,
        set_module_8bit_tensor_to_device,
        set_module_quantized_tensor_to_device,
    )
    from .deepspeed import (
        HfDeepSpeedConfig,
        HfTrainerDeepSpeedConfig,
        deepspeed_config,
        deepspeed_init,
        deepspeed_load_checkpoint,
        deepspeed_optim_sched,
        is_deepspeed_available,
        is_deepspeed_zero3_enabled,
        set_hf_deepspeed_config,
        unset_hf_deepspeed_config,
    )
    from .eetq import replace_with_eetq_linear
    from .hqq import prepare_for_hqq_linear
    from .integration_utils import (
        INTEGRATION_TO_CALLBACK,
        AzureMLCallback,
        ClearMLCallback,
        CodeCarbonCallback,
        CometCallback,
        DagsHubCallback,
        DVCLiveCallback,
        FlyteCallback,
        MLflowCallback,
        NeptuneCallback,
        NeptuneMissingConfiguration,
        TensorBoardCallback,
        WandbCallback,
        get_available_reporting_integrations,
        get_reporting_integration_callbacks,
        hp_params,
        is_azureml_available,
        is_clearml_available,
        is_codecarbon_available,
        is_comet_available,
        is_dagshub_available,
        is_dvclive_available,
        is_flyte_deck_standard_available,
        is_flytekit_available,
        is_mlflow_available,
        is_neptune_available,
        is_optuna_available,
        is_ray_available,
        is_ray_tune_available,
        is_sigopt_available,
        is_tensorboard_available,
        is_wandb_available,
        rewrite_logs,
        run_hp_search_optuna,
        run_hp_search_ray,
        run_hp_search_sigopt,
        run_hp_search_wandb,
    )
    from .peft import PeftAdapterMixin
    from .quanto import replace_with_quanto_layers
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
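
# Illustration of the effect (a sketch, not part of the file): importing the
# package stays cheap, and the heavy submodule import only happens on first
# attribute access.
#
#     import transformers.integrations as integrations
#     fn = integrations.prepare_for_hqq_linear  # imports .hqq at this point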