-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Update GPT-QModel references and deprecate AutoGPTQ #3190
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
fd3ec2d
d6c03d1
e32e477
493e753
98c4b50
dc4adcd
7253afc
76c09c2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -145,11 +145,11 @@ config = LoraConfig(target_modules="all-linear", ...) | |||||
|
|
||||||
| ## GPTQ quantization | ||||||
|
|
||||||
| You can learn more about gptq based `[2, 3, 4, 8]` bits quantization at [GPTQModel](https://github.com/ModelCloud/GPTQModel) and the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) doc. Post-quant training, PEFT can use both [GPTQModel](https://github.com/ModelCloud/GPTQModel) or [AutoGPTQ](https://github.com/autogptq/autogptq) libraries, but we recommend GPTQModel because AutoGPTQ will be deprecated in a future release. | ||||||
| You can learn more about GPTQ-based `[2, 3, 4, 8]` bit quantization at [GPTQ-Model](https://github.com/ModelCloud/GPTQModel) and in the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) documentation. PEFT supports GPTQ post-training through GPTQ-Model. | ||||||
|
Qubitium marked this conversation as resolved.
Outdated
|
||||||
|
|
||||||
| ```bash | ||||||
| # gptqmodel install | ||||||
| pip install gptqmodel --no-build-isolation | ||||||
| # GPTQ-Model install | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| pip install "gptqmodel>=6.0.3" --no-build-isolation | ||||||
| ``` | ||||||
|
|
||||||
| ```py | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -55,7 +55,7 @@ | |||||
| get_peft_model, | ||||||
| prepare_model_for_kbit_training, | ||||||
| ) | ||||||
| from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_gptqmodel_available, is_xpu_available | ||||||
| from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_xpu_available | ||||||
| from peft.tuners.lora.config import LoraRuntimeConfig | ||||||
| from peft.utils import infer_device | ||||||
|
|
||||||
|
|
@@ -70,9 +70,6 @@ | |||||
| ) | ||||||
|
|
||||||
|
|
||||||
| if is_gptqmodel_available(): | ||||||
| from gptqmodel import BACKEND | ||||||
|
|
||||||
| if is_bnb_available(): | ||||||
| import bitsandbytes as bnb | ||||||
|
|
||||||
|
|
@@ -522,14 +519,15 @@ def test_ia3_bnb_quantization_from_pretrained_safetensors(self, quantization): | |||||
|
|
||||||
| @require_gptqmodel | ||||||
| @pytest.mark.single_gpu_tests | ||||||
| @require_gptqmodel | ||||||
| def test_lora_gptq_quantization_from_pretrained_safetensors(self): | ||||||
| r""" | ||||||
| Tests that the autogptq quantization using LoRA works as expected with safetensors weights. | ||||||
| Tests that GPTQ-Model quantization using LoRA works as expected with safetensors weights. | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| """ | ||||||
| from transformers import GPTQConfig | ||||||
|
|
||||||
| model_id = "marcsun13/opt-350m-gptq-4bit" | ||||||
| quantization_config = GPTQConfig(bits=4, backend=BACKEND.AUTO_TRAINABLE) | ||||||
| quantization_config = GPTQConfig(bits=4) | ||||||
| # Use explicit device instead of "auto" to ensure model stays on single device | ||||||
| # This avoids device mismatch issues when reloading the model | ||||||
| device_map = f"{self.device}:0" # e.g., "cuda:0", "xpu:0" | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -52,7 +52,7 @@ | |||||
| @require_gptqmodel | ||||||
| class PeftGPTQModelCommonTests(unittest.TestCase): | ||||||
| r""" | ||||||
| A common tester to run common operations that are performed on GPU/CPU such as generation, loading in 8bit, etc. | ||||||
| A common tester to run GPTQ-Model operations that are performed on GPU/CPU such as generation and adapter loading. | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| """ | ||||||
|
|
||||||
| def setUp(self): | ||||||
|
|
@@ -69,12 +69,12 @@ def tearDown(self): | |||||
|
|
||||||
| def test_lora_gptq_quantization_from_pretrained_safetensors(self): | ||||||
| r""" | ||||||
| Tests that the gptqmodel quantization using LoRA works as expected with safetensors weights. | ||||||
| Tests that GPTQ-Model quantization using LoRA works as expected with safetensors weights. | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| """ | ||||||
| from transformers import GPTQConfig | ||||||
|
|
||||||
| model_id = "marcsun13/opt-350m-gptq-4bit" | ||||||
| quantization_config = GPTQConfig(bits=4, use_exllama=False) | ||||||
| quantization_config = GPTQConfig(bits=4) | ||||||
| kwargs = { | ||||||
| "pretrained_model_name_or_path": model_id, | ||||||
| "dtype": torch.float16, | ||||||
|
|
@@ -106,12 +106,12 @@ def test_lora_gptq_quantization_from_pretrained_safetensors(self): | |||||
|
|
||||||
| def test_oft_gptq_quantization_from_pretrained_safetensors(self): | ||||||
| r""" | ||||||
| Tests that the gptqmodel quantization using OFT works as expected with safetensors weights. | ||||||
| Tests that GPTQ-Model quantization using OFT works as expected with safetensors weights. | ||||||
|
Qubitium marked this conversation as resolved.
Outdated
|
||||||
| """ | ||||||
| from transformers import GPTQConfig | ||||||
|
|
||||||
| model_id = "marcsun13/opt-350m-gptq-4bit" | ||||||
| quantization_config = GPTQConfig(bits=4, use_exllama=False) | ||||||
| quantization_config = GPTQConfig(bits=4) | ||||||
| kwargs = { | ||||||
| "pretrained_model_name_or_path": model_id, | ||||||
| "dtype": torch.float16, | ||||||
|
|
@@ -146,14 +146,16 @@ def test_oft_gptq_quantization_from_pretrained_safetensors(self): | |||||
| @require_optimum | ||||||
| class PeftGPTQModelTests(unittest.TestCase): | ||||||
| r""" | ||||||
| GPTQ + peft tests | ||||||
| GPTQ-Model + PEFT tests | ||||||
|
Qubitium marked this conversation as resolved.
Outdated
|
||||||
| """ | ||||||
|
|
||||||
| def setUp(self): | ||||||
| from transformers import GPTQConfig | ||||||
| from transformers.utils.quantization_config import AwqBackend | ||||||
|
|
||||||
| self.causal_lm_model_id = "marcsun13/opt-350m-gptq-4bit" | ||||||
| self.quantization_config = GPTQConfig(bits=4, backend="auto_trainable") | ||||||
| # PEFT needs GPTQ-Model's trainable backend here rather than inference auto-selection. | ||||||
| self.quantization_config = GPTQConfig(bits=4, backend=AwqBackend.AUTO_TRAINABLE) | ||||||
| self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) | ||||||
|
|
||||||
| def tearDown(self): | ||||||
|
|
||||||
Uh oh!
There was an error while loading. Please reload this page.