diff --git a/docker/peft-gpu/Dockerfile b/docker/peft-gpu/Dockerfile index 88fbfa8405..c9d7fdb5bc 100644 --- a/docker/peft-gpu/Dockerfile +++ b/docker/peft-gpu/Dockerfile @@ -41,14 +41,12 @@ SHELL ["/bin/bash", "-c"] RUN conda run -n peft pip install --no-cache-dir bitsandbytes optimum -# GPTQmodel doesn't find torch without build isolation -# # Note: we are hard-coding CUDA_ARCH_LIST here since `gptqmodel` requires either nvidia-smi # or CUDA_ARCH_LIST for compute capability information. Since the docker build is unlikely # to have compute hardware available we use the information from the CI runner (which hosts # a NVIDIA L4). So we fix the compute capability to 8.9. In the future we might extend this # to a list of compute capabilities (separated by ;). -RUN CUDA_ARCH_LIST=8.9 conda run -n peft pip install gptqmodel +RUN CUDA_ARCH_LIST=8.9 conda run -n peft pip install "gptqmodel>=7.0.0" RUN \ # Add eetq for quantization testing; needs to run without build isolation since the setup diff --git a/docs/source/developer_guides/quantization.md b/docs/source/developer_guides/quantization.md index 3052aabf71..e6f75d76a2 100644 --- a/docs/source/developer_guides/quantization.md +++ b/docs/source/developer_guides/quantization.md @@ -145,11 +145,11 @@ config = LoraConfig(target_modules="all-linear", ...) ## GPTQ quantization -You can learn more about gptq based `[2, 3, 4, 8]` bits quantization at [GPTQModel](https://github.com/ModelCloud/GPTQModel) and the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) doc. Post-quant training, PEFT can use both [GPTQModel](https://github.com/ModelCloud/GPTQModel) or [AutoGPTQ](https://github.com/autogptq/autogptq) libraries, but we recommend GPTQModel because AutoGPTQ will be deprecated in a future release. 
+You can learn more about GPTQ-based `[2, 3, 4, 8]` bit quantization at [GPT-QModel](https://github.com/ModelCloud/GPTQModel) and in the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) documentation. PEFT supports training adapters on top of GPTQ-quantized models (i.e. training after quantization) through GPT-QModel. ```bash -# gptqmodel install -pip install gptqmodel --no-build-isolation +# GPT-QModel install +pip install "gptqmodel>=7.0.0" ``` ```py diff --git a/examples/sft/README.md b/examples/sft/README.md index abbf957d9b..3204a679a3 100644 --- a/examples/sft/README.md +++ b/examples/sft/README.md @@ -32,8 +32,8 @@ When you have access to multiple GPUs, it would be better to use normal LoRA wit Note: FSDP is currently not compatible with 8bit bitsandbytes quantization. -## Multi-GPU SFT with LoRA and FSDP for GPTQModel: -As in [Multi-GPU SFT with LoRA and FSDP](https://github.com/huggingface/peft/blob/main/examples/sft/README.md#multi-gpu-sft-with-lora-and-fsdp), we also support other quantization methods like GPTQModel. You may need to install [GPTQModel](https://github.com/ModelCloud/GPTQModel) > v3.0.0 or from source. Here is the launch command for reference: [run_peft_fsdp_gptq.sh]. For the `--model_name_or_path` argument, it is important to pass a model that is already quantized with GPTQModel, like `"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"`. +## Multi-GPU SFT with LoRA and FSDP for GPT-QModel: +As in [Multi-GPU SFT with LoRA and FSDP](https://github.com/huggingface/peft/blob/main/examples/sft/README.md#multi-gpu-sft-with-lora-and-fsdp), we also support other quantization methods like GPT-QModel. You need to install [GPT-QModel](https://github.com/ModelCloud/GPTQModel) v7.0.0 or later (or build it from source); older versions are rejected by PEFT. Here is the launch command for reference: [run_peft_fsdp_gptq.sh]. For the `--model_name_or_path` argument, it is important to pass a model that is already quantized with GPT-QModel, like `"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"`. 
Note: there is a bug in transformers v4.53.0 for this case, please skip this transformers version. diff --git a/src/peft/import_utils.py b/src/peft/import_utils.py index dce71828b3..89f13d48d8 100644 --- a/src/peft/import_utils.py +++ b/src/peft/import_utils.py @@ -50,7 +50,7 @@ def is_bnb_4bit_available() -> bool: @lru_cache def is_gptqmodel_available(): if importlib.util.find_spec("gptqmodel") is not None: - GPTQMODEL_MINIMUM_VERSION = packaging.version.parse("5.6.12") + GPTQMODEL_MINIMUM_VERSION = packaging.version.parse("7.0.0") OPTIMUM_MINIMUM_VERSION = packaging.version.parse("1.24.0") version_gptqmodel = packaging.version.parse(importlib_metadata.version("gptqmodel")) if GPTQMODEL_MINIMUM_VERSION <= version_gptqmodel: @@ -79,7 +79,7 @@ def is_gptqmodel_available(): else: raise ImportError( f"Found an incompatible version of gptqmodel. Found version `{version_gptqmodel}`, " - f"but only versions above `{GPTQMODEL_MINIMUM_VERSION}` are supported" + f"but only versions `{GPTQMODEL_MINIMUM_VERSION}` or higher are supported" ) diff --git a/src/peft/tuners/lora/aqlm.py b/src/peft/tuners/lora/aqlm.py index fd7dec3991..31a5c2190a 100644 --- a/src/peft/tuners/lora/aqlm.py +++ b/src/peft/tuners/lora/aqlm.py @@ -82,12 +82,6 @@ def __repr__(self) -> str: rep = super().__repr__() return "lora." 
+ rep - # TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102 - # def reset_lora_parameters(self, adapter_name): - # if adapter_name in self.lora_A.keys(): - # torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight) - # torch.nn.init.zeros_(self.lora_B[adapter_name].weight) - def dispatch_aqlm( target: torch.nn.Module, diff --git a/src/peft/tuners/lora/awq.py b/src/peft/tuners/lora/awq.py index 3311bba935..41f79330fd 100644 --- a/src/peft/tuners/lora/awq.py +++ b/src/peft/tuners/lora/awq.py @@ -18,6 +18,7 @@ from peft.import_utils import is_gptqmodel_available from peft.tuners.lora.layer import LoraLayer from peft.tuners.tuners_utils import BaseTunerLayer +from peft.utils.other import is_gptqmodel_awq_layer from .config import LoraConfig @@ -94,11 +95,8 @@ def dispatch_awq( else: target_base_layer = target - if is_gptqmodel_available(): - from gptqmodel.nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear - - if isinstance(target_base_layer, AwqGEMMQuantLinear): - new_module = AwqLoraLinear(target, adapter_name, config=config, **kwargs) - target.qweight = target_base_layer.qweight + if is_gptqmodel_available() and is_gptqmodel_awq_layer(target_base_layer): + new_module = AwqLoraLinear(target, adapter_name, config=config, **kwargs) + target.qweight = target_base_layer.qweight return new_module diff --git a/src/peft/tuners/lora/gptq.py b/src/peft/tuners/lora/gptq.py index d210e7e45d..30b8a7c63f 100644 --- a/src/peft/tuners/lora/gptq.py +++ b/src/peft/tuners/lora/gptq.py @@ -105,12 +105,6 @@ def __repr__(self) -> str: rep = super().__repr__() return "lora." 
+ rep - # TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102 - # def reset_lora_parameters(self, adapter_name): - # if adapter_name in self.lora_A.keys(): - # torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight) - # torch.nn.init.zeros_(self.lora_B[adapter_name].weight) - def dispatch_gptq( target: torch.nn.Module, diff --git a/src/peft/tuners/mixed/model.py b/src/peft/tuners/mixed/model.py index 60a23cc929..8aad483cdb 100644 --- a/src/peft/tuners/mixed/model.py +++ b/src/peft/tuners/mixed/model.py @@ -27,7 +27,6 @@ ModulesToSaveWrapper, PeftType, _get_submodules, - get_gptqmodel_quant_linear, ) from peft.utils.other import _set_adapter @@ -175,8 +174,7 @@ def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: @staticmethod def _create_new_module(config, adapter_name, target, **kwargs): gptq_quantization_config = kwargs.get("gptq_quantization_config", None) - GPTQQuantLinear = get_gptqmodel_quant_linear(gptq_quantization_config) - if (gptq_quantization_config is not None) or (GPTQQuantLinear is not None): + if gptq_quantization_config is not None: raise ValueError(f"GPTQ quantization not supported for {config.peft_type.value} (yet).") loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) diff --git a/src/peft/tuners/oft/awq.py b/src/peft/tuners/oft/awq.py index 9289072bcd..78f30f9f0e 100644 --- a/src/peft/tuners/oft/awq.py +++ b/src/peft/tuners/oft/awq.py @@ -18,6 +18,7 @@ from peft.import_utils import is_gptqmodel_available from peft.tuners.oft.layer import OFTLayer from peft.tuners.tuners_utils import BaseTunerLayer +from peft.utils.other import is_gptqmodel_awq_layer from .config import OFTConfig @@ -84,11 +85,8 @@ def dispatch_awq( else: target_base_layer = target - if is_gptqmodel_available(): - from gptqmodel.nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear - - if isinstance(target_base_layer, AwqGEMMQuantLinear): - new_module = AwqOFTLinear(target, adapter_name, **kwargs) - 
target.qweight = target_base_layer.qweight + if is_gptqmodel_available() and is_gptqmodel_awq_layer(target_base_layer): + new_module = AwqOFTLinear(target, adapter_name, **kwargs) + target.qweight = target_base_layer.qweight return new_module diff --git a/src/peft/tuners/oft/layer.py b/src/peft/tuners/oft/layer.py index 108aebe1b0..8c90238f7e 100644 --- a/src/peft/tuners/oft/layer.py +++ b/src/peft/tuners/oft/layer.py @@ -21,6 +21,7 @@ import torch.nn.functional as F from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.utils.other import is_gptqmodel_quant_linear from .config import OFTConfig @@ -358,8 +359,8 @@ def __init__(self, base_layer: nn.Module, **kwargs) -> None: elif hasattr(base_layer, "codebooks") and base_layer.__class__.__name__ == "QuantizedLinear": # AQLM QuantLinear in_features, out_features = base_layer.in_features, base_layer.out_features - elif hasattr(base_layer, "bits") and base_layer.__class__.__name__ == "AwqGEMMQuantLinear": - # Awq layers + elif is_gptqmodel_quant_linear(base_layer): + # GPT-QModel quantized linears in_features, out_features = base_layer.in_features, base_layer.out_features elif base_layer.__class__.__name__ == "EetqLinear": # Eetq layers diff --git a/src/peft/tuners/tuners_utils.py b/src/peft/tuners/tuners_utils.py index 8e424ba02c..a6be1a8e84 100644 --- a/src/peft/tuners/tuners_utils.py +++ b/src/peft/tuners/tuners_utils.py @@ -49,6 +49,7 @@ _get_module_names_tied_with_embedding, _set_adapter, _set_layer_requires_grad, + is_gptqmodel_quant_linear, match_target_against_key, set_additional_trainable_modules, ) @@ -208,8 +209,8 @@ def _get_in_out_features(module: nn.Module) -> tuple[int, int] | tuple[None, Non elif hasattr(module, "codebooks") and module.__class__.__name__ == "QuantizedLinear": # AQLM QuantLinear in_features, out_features = module.in_features, module.out_features - elif hasattr(module, "bits") and module.__class__.__name__ == "AwqGEMMQuantLinear": - # Awq layers + elif 
is_gptqmodel_quant_linear(module):
+        # GPT-QModel quantized linears
         in_features, out_features = module.in_features, module.out_features
     elif module.__class__.__name__ == "EetqLinear":
         # Eetq layers
diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py
index b46d3a3abd..086d9699ca 100644
--- a/src/peft/utils/other.py
+++ b/src/peft/utils/other.py
@@ -1296,6 +1296,51 @@ def get_quantization_config(model: torch.nn.Module, method: str):
     return None
 
 
+def is_gptqmodel_quant_linear(module: Optional[torch.nn.Module]) -> bool:
+    """
+    Check if a module is a GPT-QModel quantized linear layer.
+
+    Args:
+        module (`torch.nn.Module`, *optional*):
+            The module to inspect. `None` is accepted and yields `False`.
+
+    Returns:
+        `bool`: `True` if `module` is an instance of GPT-QModel's `BaseQuantLinear`, otherwise `False`. This
+        includes the cases where GPT-QModel is reported as unavailable or `BaseQuantLinear` cannot be imported.
+    """
+    # Short-circuit on `None` so that the availability check is only performed for real modules.
+    if module is None or not is_gptqmodel_available():
+        return False
+
+    try:
+        from gptqmodel.nn_modules.qlinear import BaseQuantLinear
+    except ImportError:
+        # The expected base class is not importable, so the module cannot be matched against it.
+        return False
+
+    return isinstance(module, BaseQuantLinear)
+
+
+def is_gptqmodel_awq_layer(module: Optional[torch.nn.Module]) -> bool:
+    """
+    Check if a module is a GPT-QModel quantized linear that supports the AWQ method.
+
+    Args:
+        module (`torch.nn.Module`, *optional*):
+            The module to inspect. `None` is accepted and yields `False`.
+
+    Returns:
+        `bool`: `True` if `module` is a GPT-QModel quantized linear whose `SUPPORTS_METHODS` advertises `"awq"`.
+    """
+    if not is_gptqmodel_quant_linear(module):
+        return False
+
+    # `SUPPORTS_METHODS` may be missing or set to `None`; both mean that no method is advertised.
+    supported_methods = getattr(module, "SUPPORTS_METHODS", None) or ()
+    # Compare on the underlying value so that enum members and plain strings are both handled.
+    return any(getattr(method, "value", method) == "awq" for method in supported_methods)
+
+
 def get_gptqmodel_quant_linear(gptq_quantization_config, device_map=None):
     """
     Get the right GPTQQuantLinear class based on the quantization config file
diff --git a/tests/test_common_gpu.py b/tests/test_common_gpu.py
index 91f7c0b0b1..72cbc6a60a 100644
--- a/tests/test_common_gpu.py
+++ b/tests/test_common_gpu.py
@@ -55,7 +55,7 @@
     get_peft_model,
     prepare_model_for_kbit_training,
 )
-from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_gptqmodel_available, is_xpu_available
+from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_xpu_available
 from peft.tuners.lora.config import LoraRuntimeConfig
 from peft.utils import infer_device
 
@@ -70,9 +70,6 @@
 )
 
 
-if is_gptqmodel_available():
-    from gptqmodel import BACKEND
-
 if is_bnb_available():
     import bitsandbytes as bnb
 
@@ 
-522,14 +519,15 @@ def test_ia3_bnb_quantization_from_pretrained_safetensors(self, quantization): @require_gptqmodel @pytest.mark.single_gpu_tests + @require_gptqmodel def test_lora_gptq_quantization_from_pretrained_safetensors(self): r""" - Tests that the autogptq quantization using LoRA works as expected with safetensors weights. + Tests that GPT-QModel quantization using LoRA works as expected with safetensors weights. """ from transformers import GPTQConfig model_id = "marcsun13/opt-350m-gptq-4bit" - quantization_config = GPTQConfig(bits=4, backend=BACKEND.AUTO_TRAINABLE) + quantization_config = GPTQConfig(bits=4) # Use explicit device instead of "auto" to ensure model stays on single device # This avoids device mismatch issues when reloading the model device_map = f"{self.device}:0" # e.g., "cuda:0", "xpu:0" diff --git a/tests/test_gptqmodel.py b/tests/test_gptqmodel.py index 16cdb90e07..67504d14f9 100644 --- a/tests/test_gptqmodel.py +++ b/tests/test_gptqmodel.py @@ -52,7 +52,7 @@ @require_gptqmodel class PeftGPTQModelCommonTests(unittest.TestCase): r""" - A common tester to run common operations that are performed on GPU/CPU such as generation, loading in 8bit, etc. + A common tester to run GPT-QModel operations that are performed on GPU/CPU such as generation and adapter loading. """ def setUp(self): @@ -69,12 +69,12 @@ def tearDown(self): def test_lora_gptq_quantization_from_pretrained_safetensors(self): r""" - Tests that the gptqmodel quantization using LoRA works as expected with safetensors weights. + Tests that GPT-QModel quantization using LoRA works as expected with safetensors weights. 
""" from transformers import GPTQConfig model_id = "marcsun13/opt-350m-gptq-4bit" - quantization_config = GPTQConfig(bits=4, use_exllama=False) + quantization_config = GPTQConfig(bits=4) kwargs = { "pretrained_model_name_or_path": model_id, "dtype": torch.float16, @@ -106,12 +106,12 @@ def test_lora_gptq_quantization_from_pretrained_safetensors(self): def test_oft_gptq_quantization_from_pretrained_safetensors(self): r""" - Tests that the gptqmodel quantization using OFT works as expected with safetensors weights. + Tests that GPT-QModel quantization using OFT works as expected with safetensors weights. """ from transformers import GPTQConfig model_id = "marcsun13/opt-350m-gptq-4bit" - quantization_config = GPTQConfig(bits=4, use_exllama=False) + quantization_config = GPTQConfig(bits=4) kwargs = { "pretrained_model_name_or_path": model_id, "dtype": torch.float16, @@ -146,14 +146,16 @@ def test_oft_gptq_quantization_from_pretrained_safetensors(self): @require_optimum class PeftGPTQModelTests(unittest.TestCase): r""" - GPTQ + peft tests + GPT-QModel + PEFT tests """ def setUp(self): from transformers import GPTQConfig + from transformers.utils.quantization_config import AwqBackend self.causal_lm_model_id = "marcsun13/opt-350m-gptq-4bit" - self.quantization_config = GPTQConfig(bits=4, backend="auto_trainable") + # PEFT needs GPT-QModel's trainable backend here rather than inference auto-selection. + self.quantization_config = GPTQConfig(bits=4, backend=AwqBackend.AUTO_TRAINABLE) self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) def tearDown(self): diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py index ff1577b739..3b1ea37978 100644 --- a/tests/test_gpu_examples.py +++ b/tests/test_gpu_examples.py @@ -2203,7 +2203,7 @@ def tokenize(samples): @require_optimum class PeftGPTQGPUTests(unittest.TestCase): r""" - GPTQ + peft tests + GPT-QModel + PEFT tests """ def setUp(self):