diff --git a/.github/workflows/test_gptq.yml b/.github/workflows/test_gptq.yml index ded8cad9ad..bfb022420c 100644 --- a/.github/workflows/test_gptq.yml +++ b/.github/workflows/test_gptq.yml @@ -44,7 +44,9 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv + uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128 uv pip install .[tests] + uv pip install pypcre "setuptools>=78.1.1,<82" uv pip install "gptqmodel>=5.6.12" --no-build-isolation - name: Run tests diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index b561e2e4e5..713997abd1 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -47,7 +47,7 @@ from accelerate.hooks import remove_hook_from_module if is_gptqmodel_available(): - from gptqmodel import BACKEND, QuantizeConfig, exllama_set_max_input_length + from gptqmodel import BACKEND, QuantizeConfig from gptqmodel.quantization import FORMAT, GPTQ, METHOD from gptqmodel.utils.importer import hf_select_quant_linear_v2 from gptqmodel.utils.model import hf_convert_gptq_v1_to_v2_format, hf_convert_gptq_v2_to_v1_format @@ -669,8 +669,18 @@ class StoreAttr(object): model.quantize_config = StoreAttr() model.quantize_config.desc_act = self.desc_act model = gptq_post_init(model, use_act_order=self.desc_act) - if self.desc_act and self.backend == BACKEND.EXLLAMA_V1 and self.max_input_length is not None: + # Keep this compatibility guard for older gptqmodel versions where EXLLAMA_V1 still exists. + # This branch can be removed once we bump the minimum gptqmodel version and drop v1 support. + if ( + hasattr(BACKEND, "EXLLAMA_V1") + and self.backend == BACKEND.EXLLAMA_V1 + and self.desc_act + and self.max_input_length is not None + ): + from gptqmodel import exllama_set_max_input_length + model = exllama_set_max_input_length(model, self.max_input_length) + return model def pack_model(