Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/peft-gpu/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ RUN conda run -n peft pip install --no-cache-dir bitsandbytes optimum
# to have compute hardware available we use the information from the CI runner (which hosts
# a NVIDIA L4). So we fix the compute capability to 8.9. In the future we might extend this
# to a list of compute capabilities (separated by ;).
RUN CUDA_ARCH_LIST=8.9 conda run -n peft pip install gptqmodel
RUN CUDA_ARCH_LIST=8.9 conda run -n peft pip install --no-build-isolation "gptqmodel>=6.0.3"
Comment thread
Qubitium marked this conversation as resolved.
Outdated

RUN \
# Add eetq for quantization testing; needs to run without build isolation since the setup
Expand Down
6 changes: 3 additions & 3 deletions docs/source/developer_guides/quantization.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,11 +145,11 @@ config = LoraConfig(target_modules="all-linear", ...)

## GPTQ quantization

You can learn more about gptq based `[2, 3, 4, 8]` bits quantization at [GPTQModel](https://github.com/ModelCloud/GPTQModel) and the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) doc. Post-quant training, PEFT can use both [GPTQModel](https://github.com/ModelCloud/GPTQModel) or [AutoGPTQ](https://github.com/autogptq/autogptq) libraries, but we recommend GPTQModel because AutoGPTQ will be deprecated in a future release.
You can learn more about GPTQ-based `[2, 3, 4, 8]` bit quantization at [GPTQ-Model](https://github.com/ModelCloud/GPTQModel) and in the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) documentation. For training after quantization (for example, fine-tuning adapters on top of a GPTQ-quantized model), PEFT relies on GPTQ-Model.
Comment thread
Qubitium marked this conversation as resolved.
Outdated

```bash
# gptqmodel install
pip install gptqmodel --no-build-isolation
# GPTQ-Model install
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# GPTQ-Model install
# GPT-QModel install

pip install "gptqmodel>=6.0.3" --no-build-isolation
```

```py
Expand Down
2 changes: 1 addition & 1 deletion src/peft/import_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def is_bnb_4bit_available() -> bool:
@lru_cache
def is_gptqmodel_available():
if importlib.util.find_spec("gptqmodel") is not None:
GPTQMODEL_MINIMUM_VERSION = packaging.version.parse("5.6.12")
GPTQMODEL_MINIMUM_VERSION = packaging.version.parse("6.0.3")
OPTIMUM_MINIMUM_VERSION = packaging.version.parse("1.24.0")
version_gptqmodel = packaging.version.parse(importlib_metadata.version("gptqmodel"))
if GPTQMODEL_MINIMUM_VERSION <= version_gptqmodel:
Expand Down
2 changes: 1 addition & 1 deletion src/peft/tuners/lora/aqlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def __repr__(self) -> str:
rep = super().__repr__()
return "lora." + rep

# TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102
# TODO: Check whether an alternative initialization would improve AQLM-backed LoRA layers.
Comment thread
Qubitium marked this conversation as resolved.
Outdated
# def reset_lora_parameters(self, adapter_name):
# if adapter_name in self.lora_A.keys():
# torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
Expand Down
10 changes: 4 additions & 6 deletions src/peft/tuners/lora/awq.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from peft.import_utils import is_gptqmodel_available
from peft.tuners.lora.layer import LoraLayer
from peft.tuners.tuners_utils import BaseTunerLayer
from peft.utils.other import is_gptqmodel_awq_layer

from .config import LoraConfig

Expand Down Expand Up @@ -94,11 +95,8 @@ def dispatch_awq(
else:
target_base_layer = target

if is_gptqmodel_available():
from gptqmodel.nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear

if isinstance(target_base_layer, AwqGEMMQuantLinear):
new_module = AwqLoraLinear(target, adapter_name, config=config, **kwargs)
target.qweight = target_base_layer.qweight
if is_gptqmodel_available() and is_gptqmodel_awq_layer(target_base_layer):
new_module = AwqLoraLinear(target, adapter_name, config=config, **kwargs)
target.qweight = target_base_layer.qweight

return new_module
2 changes: 1 addition & 1 deletion src/peft/tuners/lora/gptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def __repr__(self) -> str:
rep = super().__repr__()
return "lora." + rep

# TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102
# TODO: Check whether an alternative initialization is better for GPTQ-Model-backed LoRA layers.
Comment thread
Qubitium marked this conversation as resolved.
Outdated
# def reset_lora_parameters(self, adapter_name):
# if adapter_name in self.lora_A.keys():
# torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
Expand Down
4 changes: 1 addition & 3 deletions src/peft/tuners/mixed/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
ModulesToSaveWrapper,
PeftType,
_get_submodules,
get_gptqmodel_quant_linear,
)
from peft.utils.other import _set_adapter

Expand Down Expand Up @@ -175,8 +174,7 @@ def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
@staticmethod
def _create_new_module(config, adapter_name, target, **kwargs):
gptq_quantization_config = kwargs.get("gptq_quantization_config", None)
GPTQQuantLinear = get_gptqmodel_quant_linear(gptq_quantization_config)
if (gptq_quantization_config is not None) or (GPTQQuantLinear is not None):
if gptq_quantization_config is not None:
raise ValueError(f"GPTQ quantization not supported for {config.peft_type.value} (yet).")

loaded_in_8bit = kwargs.pop("loaded_in_8bit", False)
Expand Down
10 changes: 4 additions & 6 deletions src/peft/tuners/oft/awq.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from peft.import_utils import is_gptqmodel_available
from peft.tuners.oft.layer import OFTLayer
from peft.tuners.tuners_utils import BaseTunerLayer
from peft.utils.other import is_gptqmodel_awq_layer

from .config import OFTConfig

Expand Down Expand Up @@ -84,11 +85,8 @@ def dispatch_awq(
else:
target_base_layer = target

if is_gptqmodel_available():
from gptqmodel.nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear

if isinstance(target_base_layer, AwqGEMMQuantLinear):
new_module = AwqOFTLinear(target, adapter_name, **kwargs)
target.qweight = target_base_layer.qweight
if is_gptqmodel_available() and is_gptqmodel_awq_layer(target_base_layer):
new_module = AwqOFTLinear(target, adapter_name, **kwargs)
target.qweight = target_base_layer.qweight

return new_module
5 changes: 3 additions & 2 deletions src/peft/tuners/oft/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import torch.nn.functional as F

from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
from peft.utils.other import is_gptqmodel_quant_linear

from .config import OFTConfig

Expand Down Expand Up @@ -358,8 +359,8 @@ def __init__(self, base_layer: nn.Module, **kwargs) -> None:
elif hasattr(base_layer, "codebooks") and base_layer.__class__.__name__ == "QuantizedLinear":
# AQLM QuantLinear
in_features, out_features = base_layer.in_features, base_layer.out_features
elif hasattr(base_layer, "bits") and base_layer.__class__.__name__ == "AwqGEMMQuantLinear":
# Awq layers
elif is_gptqmodel_quant_linear(base_layer):
# GPTQ-Model quantized linears
in_features, out_features = base_layer.in_features, base_layer.out_features
elif base_layer.__class__.__name__ == "EetqLinear":
# Eetq layers
Expand Down
5 changes: 3 additions & 2 deletions src/peft/tuners/tuners_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
_get_module_names_tied_with_embedding,
_set_adapter,
_set_layer_requires_grad,
is_gptqmodel_quant_linear,
match_target_against_key,
set_additional_trainable_modules,
)
Expand Down Expand Up @@ -208,8 +209,8 @@ def _get_in_out_features(module: nn.Module) -> tuple[int, int] | tuple[None, Non
elif hasattr(module, "codebooks") and module.__class__.__name__ == "QuantizedLinear":
# AQLM QuantLinear
in_features, out_features = module.in_features, module.out_features
elif hasattr(module, "bits") and module.__class__.__name__ == "AwqGEMMQuantLinear":
# Awq layers
elif is_gptqmodel_quant_linear(module):
# GPTQ-Model quantized linears
in_features, out_features = module.in_features, module.out_features
elif module.__class__.__name__ == "EetqLinear":
# Eetq layers
Expand Down
26 changes: 26 additions & 0 deletions src/peft/utils/other.py
Original file line number Diff line number Diff line change
Expand Up @@ -1296,6 +1296,32 @@ def get_quantization_config(model: torch.nn.Module, method: str):
return None


def is_gptqmodel_quant_linear(module: Optional[torch.nn.Module]) -> bool:
    """
    Report whether `module` is an instance of gptqmodel's `BaseQuantLinear`.

    Returns `False` without raising when `module` is `None`, when `is_gptqmodel_available()` is falsy, or when
    `BaseQuantLinear` cannot be imported from `gptqmodel.nn_modules.qlinear`.
    """
    if module is None:
        return False
    if not is_gptqmodel_available():
        return False

    try:
        from gptqmodel.nn_modules.qlinear import BaseQuantLinear
    except ImportError:
        # The base class is not importable, so nothing can be an instance of it.
        matches = False
    else:
        matches = isinstance(module, BaseQuantLinear)
    return matches


def is_gptqmodel_awq_layer(module: Optional[torch.nn.Module]) -> bool:
    """
    Report whether `module` is a GPTQ-Model quantized linear that lists the AWQ method as supported.

    The module must first pass `is_gptqmodel_quant_linear`. Its `SUPPORTS_METHODS` attribute is then scanned; an
    entry matches when it, or its `value` attribute if it has one, equals `"awq"`. A missing or empty
    `SUPPORTS_METHODS` yields `False`.
    """
    if not is_gptqmodel_quant_linear(module):
        return False

    declared_methods = getattr(module, "SUPPORTS_METHODS", None)
    if not declared_methods:
        return False

    for entry in declared_methods:
        # Entries may be enum-like (compare their `.value`) or plain values (compare directly).
        if getattr(entry, "value", entry) == "awq":
            return True
    return False
Comment thread
Qubitium marked this conversation as resolved.
Outdated


def get_gptqmodel_quant_linear(gptq_quantization_config, device_map=None):
"""
Get the right GPTQQuantLinear class based on the quantization config file
Expand Down
10 changes: 4 additions & 6 deletions tests/test_common_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
get_peft_model,
prepare_model_for_kbit_training,
)
from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_gptqmodel_available, is_xpu_available
from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_xpu_available
from peft.tuners.lora.config import LoraRuntimeConfig
from peft.utils import infer_device

Expand All @@ -70,9 +70,6 @@
)


if is_gptqmodel_available():
from gptqmodel import BACKEND

if is_bnb_available():
import bitsandbytes as bnb

Expand Down Expand Up @@ -522,14 +519,15 @@ def test_ia3_bnb_quantization_from_pretrained_safetensors(self, quantization):

@require_gptqmodel
@pytest.mark.single_gpu_tests
@require_gptqmodel
def test_lora_gptq_quantization_from_pretrained_safetensors(self):
r"""
Tests that the autogptq quantization using LoRA works as expected with safetensors weights.
Tests that GPTQ-Model quantization using LoRA works as expected with safetensors weights.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Tests that GPTQ-Model quantization using LoRA works as expected with safetensors weights.
Tests that GPT-QModel quantization using LoRA works as expected with safetensors weights.

"""
from transformers import GPTQConfig

model_id = "marcsun13/opt-350m-gptq-4bit"
quantization_config = GPTQConfig(bits=4, backend=BACKEND.AUTO_TRAINABLE)
quantization_config = GPTQConfig(bits=4)
# Use explicit device instead of "auto" to ensure model stays on single device
# This avoids device mismatch issues when reloading the model
device_map = f"{self.device}:0" # e.g., "cuda:0", "xpu:0"
Expand Down
16 changes: 9 additions & 7 deletions tests/test_gptqmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
@require_gptqmodel
class PeftGPTQModelCommonTests(unittest.TestCase):
r"""
A common tester to run common operations that are performed on GPU/CPU such as generation, loading in 8bit, etc.
A common tester to run GPTQ-Model operations that are performed on GPU/CPU such as generation and adapter loading.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
A common tester to run GPTQ-Model operations that are performed on GPU/CPU such as generation and adapter loading.
A common tester to run GPT-QModel operations that are performed on GPU/CPU such as generation and adapter loading.

"""

def setUp(self):
Expand All @@ -69,12 +69,12 @@ def tearDown(self):

def test_lora_gptq_quantization_from_pretrained_safetensors(self):
r"""
Tests that the gptqmodel quantization using LoRA works as expected with safetensors weights.
Tests that GPTQ-Model quantization using LoRA works as expected with safetensors weights.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Tests that GPTQ-Model quantization using LoRA works as expected with safetensors weights.
Tests that GPT-QModel quantization using LoRA works as expected with safetensors weights.

"""
from transformers import GPTQConfig

model_id = "marcsun13/opt-350m-gptq-4bit"
quantization_config = GPTQConfig(bits=4, use_exllama=False)
quantization_config = GPTQConfig(bits=4)
kwargs = {
"pretrained_model_name_or_path": model_id,
"dtype": torch.float16,
Expand Down Expand Up @@ -106,12 +106,12 @@ def test_lora_gptq_quantization_from_pretrained_safetensors(self):

def test_oft_gptq_quantization_from_pretrained_safetensors(self):
r"""
Tests that the gptqmodel quantization using OFT works as expected with safetensors weights.
Tests that GPTQ-Model quantization using OFT works as expected with safetensors weights.
Comment thread
Qubitium marked this conversation as resolved.
Outdated
"""
from transformers import GPTQConfig

model_id = "marcsun13/opt-350m-gptq-4bit"
quantization_config = GPTQConfig(bits=4, use_exllama=False)
quantization_config = GPTQConfig(bits=4)
kwargs = {
"pretrained_model_name_or_path": model_id,
"dtype": torch.float16,
Expand Down Expand Up @@ -146,14 +146,16 @@ def test_oft_gptq_quantization_from_pretrained_safetensors(self):
@require_optimum
class PeftGPTQModelTests(unittest.TestCase):
r"""
GPTQ + peft tests
GPTQ-Model + PEFT tests
Comment thread
Qubitium marked this conversation as resolved.
Outdated
"""

def setUp(self):
from transformers import GPTQConfig
from transformers.utils.quantization_config import AwqBackend

self.causal_lm_model_id = "marcsun13/opt-350m-gptq-4bit"
self.quantization_config = GPTQConfig(bits=4, backend="auto_trainable")
# PEFT needs GPTQ-Model's trainable backend here rather than inference auto-selection.
self.quantization_config = GPTQConfig(bits=4, backend=AwqBackend.AUTO_TRAINABLE)
self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)

def tearDown(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_gpu_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -2203,7 +2203,7 @@ def tokenize(samples):
@require_optimum
class PeftGPTQGPUTests(unittest.TestCase):
r"""
GPTQ + peft tests
GPTQ-Model + PEFT tests
Comment thread
Qubitium marked this conversation as resolved.
Outdated
"""

def setUp(self):
Expand Down