Merged
4 changes: 1 addition & 3 deletions docker/peft-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -41,14 +41,12 @@ SHELL ["/bin/bash", "-c"]

RUN conda run -n peft pip install --no-cache-dir bitsandbytes optimum

# GPTQmodel doesn't find torch without build isolation
#
# Note: we are hard-coding CUDA_ARCH_LIST here since `gptqmodel` requires either nvidia-smi
# or CUDA_ARCH_LIST for compute capability information. Since the Docker build is unlikely
# to have compute hardware available, we use the information from the CI runner (which hosts
# an NVIDIA L4). So we fix the compute capability to 8.9. In the future we might extend this
# to a list of compute capabilities (separated by ;).
RUN CUDA_ARCH_LIST=8.9 conda run -n peft pip install gptqmodel
RUN CUDA_ARCH_LIST=8.9 conda run -n peft pip install "gptqmodel>=7.0.0"
Contributor (PR author) commented:
GPTQModel 7.0.0 no longer requires `--no-build-isolation`, as all kernels have moved from the setup stage to first-use JIT compilation.


RUN \
# Add eetq for quantization testing; needs to run without build isolation since the setup
6 changes: 3 additions & 3 deletions docs/source/developer_guides/quantization.md
@@ -145,11 +145,11 @@ config = LoraConfig(target_modules="all-linear", ...)

## GPTQ quantization

You can learn more about gptq based `[2, 3, 4, 8]` bits quantization at [GPTQModel](https://github.com/ModelCloud/GPTQModel) and the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) doc. Post-quant training, PEFT can use both [GPTQModel](https://github.com/ModelCloud/GPTQModel) or [AutoGPTQ](https://github.com/autogptq/autogptq) libraries, but we recommend GPTQModel because AutoGPTQ will be deprecated in a future release.
You can learn more about GPTQ-based `[2, 3, 4, 8]` bit quantization at [GPT-QModel](https://github.com/ModelCloud/GPTQModel) and in the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) documentation. PEFT supports GPTQ post-training through GPT-QModel.

```bash
# gptqmodel install
pip install gptqmodel --no-build-isolation
# GPT-QModel install
pip install "gptqmodel>=7.0.0"
```

```py
4 changes: 2 additions & 2 deletions examples/sft/README.md
@@ -32,8 +32,8 @@ When you have access to multiple GPUs, it would be better to use normal LoRA wit
Note: FSDP is currently not compatible with 8bit bitsandbytes quantization.


## Multi-GPU SFT with LoRA and FSDP for GPTQModel:
As in [Multi-GPU SFT with LoRA and FSDP](https://github.com/huggingface/peft/blob/main/examples/sft/README.md#multi-gpu-sft-with-lora-and-fsdp), we also support other quantization methods like GPTQModel. You may need to install [GPTQModel](https://github.com/ModelCloud/GPTQModel) > v3.0.0 or from source. Here is the launch command for reference: [run_peft_fsdp_gptq.sh]. For the `--model_name_or_path` argument, it is important to pass a model that is already quantized with GPTQModel, like `"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"`.
## Multi-GPU SFT with LoRA and FSDP for GPT-QModel:
As in [Multi-GPU SFT with LoRA and FSDP](https://github.com/huggingface/peft/blob/main/examples/sft/README.md#multi-gpu-sft-with-lora-and-fsdp), we also support other quantization methods like GPT-QModel. You may need to install [GPT-QModel](https://github.com/ModelCloud/GPTQModel) >= v7.0.0 or from source. Here is the launch command for reference: [run_peft_fsdp_gptq.sh]. For the `--model_name_or_path` argument, it is important to pass a model that is already quantized with GPT-QModel, like `"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"`.

Note: transformers v4.53.0 has a bug affecting this case; skip that version.

4 changes: 2 additions & 2 deletions src/peft/import_utils.py
@@ -50,7 +50,7 @@ def is_bnb_4bit_available() -> bool:
@lru_cache
def is_gptqmodel_available():
if importlib.util.find_spec("gptqmodel") is not None:
GPTQMODEL_MINIMUM_VERSION = packaging.version.parse("5.6.12")
GPTQMODEL_MINIMUM_VERSION = packaging.version.parse("7.0.0")
OPTIMUM_MINIMUM_VERSION = packaging.version.parse("1.24.0")
version_gptqmodel = packaging.version.parse(importlib_metadata.version("gptqmodel"))
if GPTQMODEL_MINIMUM_VERSION <= version_gptqmodel:
@@ -79,7 +79,7 @@ def is_gptqmodel_available():
else:
raise ImportError(
f"Found an incompatible version of gptqmodel. Found version `{version_gptqmodel}`, "
f"but only versions above `{GPTQMODEL_MINIMUM_VERSION}` are supported"
f"but only versions `{GPTQMODEL_MINIMUM_VERSION}` or higher are supported"
)


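The version gate above can be sketched as follows. `meets_minimum` is a simplified, hypothetical stand-in for the comparison inside `is_gptqmodel_available` — the real function also checks `optimum` and raises `ImportError` on an incompatible install:

```python
from packaging import version

# Minimum gptqmodel version accepted by PEFT after this change.
GPTQMODEL_MINIMUM_VERSION = version.parse("7.0.0")

def meets_minimum(installed: str) -> bool:
    # packaging compares release segments numerically, so "10.1.0" correctly
    # exceeds "7.0.0" where a naive string comparison would fail.
    return version.parse(installed) >= GPTQMODEL_MINIMUM_VERSION

print(meets_minimum("7.0.0"))   # True  (boundary is inclusive)
print(meets_minimum("5.6.12"))  # False (the previous minimum no longer passes)
print(meets_minimum("10.1.0"))  # True
```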
6 changes: 0 additions & 6 deletions src/peft/tuners/lora/aqlm.py
@@ -82,12 +82,6 @@ def __repr__(self) -> str:
rep = super().__repr__()
return "lora." + rep

# TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102
# def reset_lora_parameters(self, adapter_name):
# if adapter_name in self.lora_A.keys():
# torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
# torch.nn.init.zeros_(self.lora_B[adapter_name].weight)


def dispatch_aqlm(
target: torch.nn.Module,
10 changes: 4 additions & 6 deletions src/peft/tuners/lora/awq.py
@@ -18,6 +18,7 @@
from peft.import_utils import is_gptqmodel_available
from peft.tuners.lora.layer import LoraLayer
from peft.tuners.tuners_utils import BaseTunerLayer
from peft.utils.other import is_gptqmodel_awq_layer

from .config import LoraConfig

@@ -94,11 +94,8 @@ def dispatch_awq(
else:
target_base_layer = target

if is_gptqmodel_available():
from gptqmodel.nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear

if isinstance(target_base_layer, AwqGEMMQuantLinear):
new_module = AwqLoraLinear(target, adapter_name, config=config, **kwargs)
target.qweight = target_base_layer.qweight
if is_gptqmodel_available() and is_gptqmodel_awq_layer(target_base_layer):
new_module = AwqLoraLinear(target, adapter_name, config=config, **kwargs)
target.qweight = target_base_layer.qweight

return new_module
6 changes: 0 additions & 6 deletions src/peft/tuners/lora/gptq.py
@@ -105,12 +105,6 @@ def __repr__(self) -> str:
rep = super().__repr__()
return "lora." + rep

# TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102
# def reset_lora_parameters(self, adapter_name):
# if adapter_name in self.lora_A.keys():
# torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
# torch.nn.init.zeros_(self.lora_B[adapter_name].weight)


def dispatch_gptq(
target: torch.nn.Module,
4 changes: 1 addition & 3 deletions src/peft/tuners/mixed/model.py
@@ -27,7 +27,6 @@
ModulesToSaveWrapper,
PeftType,
_get_submodules,
get_gptqmodel_quant_linear,
)
from peft.utils.other import _set_adapter

@@ -175,8 +174,7 @@ def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
@staticmethod
def _create_new_module(config, adapter_name, target, **kwargs):
gptq_quantization_config = kwargs.get("gptq_quantization_config", None)
GPTQQuantLinear = get_gptqmodel_quant_linear(gptq_quantization_config)
if (gptq_quantization_config is not None) or (GPTQQuantLinear is not None):
if gptq_quantization_config is not None:
raise ValueError(f"GPTQ quantization not supported for {config.peft_type.value} (yet).")

loaded_in_8bit = kwargs.pop("loaded_in_8bit", False)
10 changes: 4 additions & 6 deletions src/peft/tuners/oft/awq.py
@@ -18,6 +18,7 @@
from peft.import_utils import is_gptqmodel_available
from peft.tuners.oft.layer import OFTLayer
from peft.tuners.tuners_utils import BaseTunerLayer
from peft.utils.other import is_gptqmodel_awq_layer

from .config import OFTConfig

@@ -84,11 +85,8 @@ def dispatch_awq(
else:
target_base_layer = target

if is_gptqmodel_available():
from gptqmodel.nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear

if isinstance(target_base_layer, AwqGEMMQuantLinear):
new_module = AwqOFTLinear(target, adapter_name, **kwargs)
target.qweight = target_base_layer.qweight
if is_gptqmodel_available() and is_gptqmodel_awq_layer(target_base_layer):
new_module = AwqOFTLinear(target, adapter_name, **kwargs)
target.qweight = target_base_layer.qweight

return new_module
5 changes: 3 additions & 2 deletions src/peft/tuners/oft/layer.py
@@ -21,6 +21,7 @@
import torch.nn.functional as F

from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
from peft.utils.other import is_gptqmodel_quant_linear

from .config import OFTConfig

@@ -358,8 +359,8 @@ def __init__(self, base_layer: nn.Module, **kwargs) -> None:
elif hasattr(base_layer, "codebooks") and base_layer.__class__.__name__ == "QuantizedLinear":
# AQLM QuantLinear
in_features, out_features = base_layer.in_features, base_layer.out_features
elif hasattr(base_layer, "bits") and base_layer.__class__.__name__ == "AwqGEMMQuantLinear":
# Awq layers
elif is_gptqmodel_quant_linear(base_layer):
# GPT-QModel quantized linears
in_features, out_features = base_layer.in_features, base_layer.out_features
elif base_layer.__class__.__name__ == "EetqLinear":
# Eetq layers
5 changes: 3 additions & 2 deletions src/peft/tuners/tuners_utils.py
@@ -49,6 +49,7 @@
_get_module_names_tied_with_embedding,
_set_adapter,
_set_layer_requires_grad,
is_gptqmodel_quant_linear,
match_target_against_key,
set_additional_trainable_modules,
)
@@ -208,8 +209,8 @@ def _get_in_out_features(module: nn.Module) -> tuple[int, int] | tuple[None, Non
elif hasattr(module, "codebooks") and module.__class__.__name__ == "QuantizedLinear":
# AQLM QuantLinear
in_features, out_features = module.in_features, module.out_features
elif hasattr(module, "bits") and module.__class__.__name__ == "AwqGEMMQuantLinear":
# Awq layers
elif is_gptqmodel_quant_linear(module):
# GPT-QModel quantized linears
in_features, out_features = module.in_features, module.out_features
elif module.__class__.__name__ == "EetqLinear":
# Eetq layers
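The switch above, from matching `AwqGEMMQuantLinear` by class name to calling `is_gptqmodel_quant_linear`, can be illustrated with a small sketch. The classes below are hypothetical stand-ins, not the real `gptqmodel` types:

```python
# Stand-in for gptqmodel.nn_modules.qlinear.BaseQuantLinear.
class BaseQuantLinear:
    def __init__(self, in_features: int, out_features: int):
        self.in_features = in_features
        self.out_features = out_features

# Before this PR, dispatch matched this one class by name; any other
# GPT-QModel kernel class fell through unrecognized.
class AwqGEMMQuantLinear(BaseQuantLinear):
    pass

class TorchQuantLinear(BaseQuantLinear):
    pass

def get_in_out_features(module):
    # An isinstance check on the shared base class covers every current and
    # future GPT-QModel quant linear without adding a branch per kernel.
    if isinstance(module, BaseQuantLinear):
        return module.in_features, module.out_features
    return None, None

print(get_in_out_features(AwqGEMMQuantLinear(768, 3072)))  # (768, 3072)
print(get_in_out_features(TorchQuantLinear(16, 32)))       # (16, 32)
print(get_in_out_features(object()))                       # (None, None)
```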
26 changes: 26 additions & 0 deletions src/peft/utils/other.py
@@ -1296,6 +1296,32 @@ def get_quantization_config(model: torch.nn.Module, method: str):
return None


def is_gptqmodel_quant_linear(module: Optional[torch.nn.Module]) -> bool:
"""
Check if a module is a GPT-QModel quantized linear.
"""
if module is None or not is_gptqmodel_available():
return False

try:
from gptqmodel.nn_modules.qlinear import BaseQuantLinear
except ImportError:
return False

return isinstance(module, BaseQuantLinear)


def is_gptqmodel_awq_layer(module: Optional[torch.nn.Module]) -> bool:
"""
Check if a module is a GPT-QModel quantized linear that supports the AWQ method.
"""
if not is_gptqmodel_quant_linear(module):
return False

supported_methods = getattr(module, "SUPPORTS_METHODS", [])
return any(method.value == "awq" for method in supported_methods)


def get_gptqmodel_quant_linear(gptq_quantization_config, device_map=None):
"""
Get the right GPTQQuantLinear class based on the quantization config file
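A rough sketch of how the `is_gptqmodel_awq_layer` check behaves. `QuantMethod` and the layer classes are hypothetical stand-ins for `gptqmodel` internals, and the real helper first verifies the module is a `BaseQuantLinear`:

```python
from enum import Enum

# Hypothetical stand-in for the quantization-method enum that gptqmodel
# attaches to its quant-linear classes via SUPPORTS_METHODS.
class QuantMethod(Enum):
    GPTQ = "gptq"
    AWQ = "awq"

class DummyAwqCapableLayer:
    SUPPORTS_METHODS = [QuantMethod.GPTQ, QuantMethod.AWQ]

class DummyGptqOnlyLayer:
    SUPPORTS_METHODS = [QuantMethod.GPTQ]

def supports_awq(module) -> bool:
    # Comparing .value rather than enum identity keeps the check robust even
    # if gptqmodel exposes a different enum class across versions.
    supported = getattr(module, "SUPPORTS_METHODS", [])
    return any(method.value == "awq" for method in supported)

print(supports_awq(DummyAwqCapableLayer()))  # True
print(supports_awq(DummyGptqOnlyLayer()))    # False
print(supports_awq(object()))                # False (no SUPPORTS_METHODS)
```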
10 changes: 4 additions & 6 deletions tests/test_common_gpu.py
@@ -55,7 +55,7 @@
get_peft_model,
prepare_model_for_kbit_training,
)
from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_gptqmodel_available, is_xpu_available
from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_xpu_available
from peft.tuners.lora.config import LoraRuntimeConfig
from peft.utils import infer_device

@@ -70,9 +70,6 @@
)


if is_gptqmodel_available():
from gptqmodel import BACKEND

if is_bnb_available():
import bitsandbytes as bnb

@@ -522,14 +519,15 @@ def test_ia3_bnb_quantization_from_pretrained_safetensors(self, quantization):

@require_gptqmodel
@pytest.mark.single_gpu_tests
@require_gptqmodel
def test_lora_gptq_quantization_from_pretrained_safetensors(self):
r"""
Tests that the autogptq quantization using LoRA works as expected with safetensors weights.
Tests that GPT-QModel quantization using LoRA works as expected with safetensors weights.
"""
from transformers import GPTQConfig

model_id = "marcsun13/opt-350m-gptq-4bit"
quantization_config = GPTQConfig(bits=4, backend=BACKEND.AUTO_TRAINABLE)
quantization_config = GPTQConfig(bits=4)
# Use explicit device instead of "auto" to ensure model stays on single device
# This avoids device mismatch issues when reloading the model
device_map = f"{self.device}:0" # e.g., "cuda:0", "xpu:0"
16 changes: 9 additions & 7 deletions tests/test_gptqmodel.py
@@ -52,7 +52,7 @@
@require_gptqmodel
class PeftGPTQModelCommonTests(unittest.TestCase):
r"""
A common tester to run common operations that are performed on GPU/CPU such as generation, loading in 8bit, etc.
A common tester to run GPT-QModel operations that are performed on GPU/CPU such as generation and adapter loading.
"""

def setUp(self):
@@ -69,12 +69,12 @@ def tearDown(self):

def test_lora_gptq_quantization_from_pretrained_safetensors(self):
r"""
Tests that the gptqmodel quantization using LoRA works as expected with safetensors weights.
Tests that GPT-QModel quantization using LoRA works as expected with safetensors weights.
"""
from transformers import GPTQConfig

model_id = "marcsun13/opt-350m-gptq-4bit"
quantization_config = GPTQConfig(bits=4, use_exllama=False)
quantization_config = GPTQConfig(bits=4)
kwargs = {
"pretrained_model_name_or_path": model_id,
"dtype": torch.float16,
@@ -106,12 +106,12 @@

def test_oft_gptq_quantization_from_pretrained_safetensors(self):
r"""
Tests that the gptqmodel quantization using OFT works as expected with safetensors weights.
Tests that GPT-QModel quantization using OFT works as expected with safetensors weights.
"""
from transformers import GPTQConfig

model_id = "marcsun13/opt-350m-gptq-4bit"
quantization_config = GPTQConfig(bits=4, use_exllama=False)
quantization_config = GPTQConfig(bits=4)
kwargs = {
"pretrained_model_name_or_path": model_id,
"dtype": torch.float16,
@@ -146,14 +146,16 @@
@require_optimum
class PeftGPTQModelTests(unittest.TestCase):
r"""
GPTQ + peft tests
GPT-QModel + PEFT tests
"""

def setUp(self):
from transformers import GPTQConfig
from transformers.utils.quantization_config import AwqBackend

self.causal_lm_model_id = "marcsun13/opt-350m-gptq-4bit"
self.quantization_config = GPTQConfig(bits=4, backend="auto_trainable")
# PEFT needs GPT-QModel's trainable backend here rather than inference auto-selection.
self.quantization_config = GPTQConfig(bits=4, backend=AwqBackend.AUTO_TRAINABLE)
self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)

def tearDown(self):
2 changes: 1 addition & 1 deletion tests/test_gpu_examples.py
@@ -2203,7 +2203,7 @@ def tokenize(samples):
@require_optimum
class PeftGPTQGPUTests(unittest.TestCase):
r"""
GPTQ + peft tests
GPT-QModel + PEFT tests
"""

def setUp(self):