microsoft · poganesh · May 23, 2026 · May 27, 2026 · May 29, 2026 · May 29, 2026
diff --git a/Qwen-Qwen1.5-7B-Chat/RyzenAI/Qwen1.5-7B-Chat_quark_ryzenai_llm.json b/Qwen-Qwen1.5-7B-Chat/RyzenAI/Qwen1.5-7B-Chat_quark_ryzenai_llm.json
@@ -0,0 +1,23 @@
+{
+    "input_model": { "type": "HFModel", "model_path": "Qwen/Qwen1.5-7B-Chat" },
+    "passes": {
+        "qq": {
+            "type": "QuarkQuantization",
+            "quant_scheme": "uint4_wo_128",
+            "quant_algo": "awq",
+            "dataset": "pileval_for_awq_benchmark",
+            "data_type": "bfloat16",
+            "num_calib_data": 128,
+            "model_export": ["hf_format"],
+            "exclude_layers": []
+        },
+        "mg": {
+            "type": "RyzenGenerateModelLLM",
+            "recipe": "token_fusion"
+        }
+    },
+    "log_severity_level": 1,
+    "output_dir": "models/Qwen1.5-7B-Chat-rai",
+    "cache_dir": "olive_cache",
+    "no_artifacts": true
+}
diff --git a/Qwen-Qwen1.5-7B-Chat/RyzenAI/README.md b/Qwen-Qwen1.5-7B-Chat/RyzenAI/README.md
@@ -0,0 +1,88 @@
+# Model Optimization and Quantization for AMD NPU
+This folder contains a sample Olive configuration for optimizing Qwen models for AMD NPUs.
+
+## ✅ Supported Models and Configs
+
+| Model Name (Hugging Face)          | Config File Name                |
+| :--------------------------------- | :------------------------------ |
+| `Qwen/Qwen1.5-7B-Chat`       | `Qwen1.5-7B-Chat_quark_ryzenai_llm.json` |
+
+## **Run the Quantization Config**
+
+### **Quark quantization**
+
+For LLMs, follow the commands below to generate the optimized model for the RyzenAI Execution Provider.
+
+**Platform Support:**
+- ✅ **Windows with CUDA** - Supported
+- ✅ **Windows with CPU** - Supported
+- ⏳ **Planned for future release:** Linux with ROCm, Linux with CUDA, Windows with ROCm
+
+For more details about Quark, see the [Quark Documentation](https://quark.docs.amd.com/latest/).
+
+#### **Create a Python 3.12 conda environment and run the below commands**
+```bash
+conda create -n olive python=3.12
+conda activate olive
+```
+
+#### **Install Olive**
+
+**Option 1: Install from PyPI**
+```bash
+pip install olive-ai[auto-opt]
+pip install transformers onnxruntime-genai
+```
+
+**Option 2: Install from source**
+```bash
+git clone https://github.com/microsoft/Olive.git
+cd Olive
+pip install -e .
+pip install -r requirements.txt
+```
+
+#### **Install RyzenAI LLM dependencies**
+
+```bash
+cd olive-recipes/Qwen-Qwen1.5-7B-Chat/RyzenAI
+pip install --force-reinstall -r requirements_ryzenai_llm.txt
+```
+
+
+
+#### **Install PyTorch**
+
+Make sure to install the correct version of PyTorch before running quantization:
+
+**For AMD GPUs (ROCm) — listed under Platform Support above as planned for a future release, not yet supported:**
+```bash
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1
+
+python -c "import torch; print(torch.cuda.is_available())"  # Must print True
+```
+
+**For NVIDIA GPUs (CUDA):**
+```bash
+pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128
+
+python -c "import torch; print(torch.cuda.is_available())"  # Must print True
+```
+
+**For CPU-only (Windows):**
+```bash
+pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+python -c "import torch; print(torch.__version__)"  # Should print 2.7.0+cpu
+```
+
+#### **Generate optimized LLM model for RyzenAI NPU**
+Follow the setup instructions above, then run the command below to generate the optimized LLM model for the RyzenAI EP.
+
+```bash
+# Qwen1.5-7B-Chat
+olive run --config Qwen1.5-7B-Chat_quark_ryzenai_llm.json
+```
+
+✅ Optimized model saved in: `models/Qwen1.5-7B-Chat-rai/`
+
+> **Note:** The output model is saved in the `output_dir` specified in the JSON config file.
diff --git a/Qwen-Qwen1.5-7B-Chat/RyzenAI/info.yaml b/Qwen-Qwen1.5-7B-Chat/RyzenAI/info.yaml
@@ -0,0 +1,6 @@
+arch: qwen2
+recipes:
+  - name: Qwen1.5-7B-Chat_RyzenAI
+    file: Qwen1.5-7B-Chat_quark_ryzenai_llm.json
+    devices: npu
+    eps: RyzenAIExecutionProvider
diff --git a/...2-7B/VitisAI/requirements_vitisai_llm.txt → ...Chat/RyzenAI/requirements_ryzenai_llm.txt b/...2-7B/VitisAI/requirements_vitisai_llm.txt → ...Chat/RyzenAI/requirements_ryzenai_llm.txt
@@ -1,4 +1,4 @@
---extra-index-url=https://pypi.amd.com/olive/1.7.1-5D/simple
+--extra-index-url=https://pypi.amd.com/olive/1.7.1-6D/simple/
 # AMD model generation
 --extra-index-url=https://pypi.amd.com/simple
 accelerate

diff --git a/Qwen-Qwen1.5-7B-Chat/VitisAI/Qwen1.5-7B-Chat_quark_vitisai_llm.json b/Qwen-Qwen1.5-7B-Chat/VitisAI/Qwen1.5-7B-Chat_quark_vitisai_llm.json
@@ -2,22 +2,39 @@
     "input_model": { "type": "HFModel", "model_path": "Qwen/Qwen1.5-7B-Chat" },
     "passes": {
         "qq": {
-            "type": "QuarkQuantization",
-            "quant_scheme": "uint4_wo_128",
+            "type": "QuarkQuantizationVitisAI",
+            "quant_scheme": "w_uint4_per_group_asym",
             "quant_algo": "awq",
             "dataset": "pileval_for_awq_benchmark",
             "data_type": "bfloat16",
             "num_calib_data": 128,
-            "model_export": ["hf_format"],
-            "exclude_layers": []
+            "model_export": [ "hf_format" ],
+            "exclude_layers": [  ],
+            "quant_config": {
+                "name": "awq",
+                "scaling_layers": [
+                    {
+                        "prev_op": "input_layernorm",
+                        "layers": [ "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj" ],
+                        "inp": "self_attn.q_proj",
+                        "module2inspect": "self_attn"
+                    },
+                    { "prev_op": "self_attn.v_proj", "layers": [ "self_attn.o_proj" ], "inp": "self_attn.o_proj" },
+                    {
+                        "prev_op": "post_attention_layernorm",
+                        "layers": [ "mlp.gate_proj", "mlp.up_proj" ],
+                        "inp": "mlp.gate_proj",
+                        "module2inspect": "mlp"
+                    },
+                    { "prev_op": "mlp.up_proj", "layers": [ "mlp.down_proj" ], "inp": "mlp.down_proj" }
+                ],
+                "model_decoder_layers": "model.layers"
+            }
         },
-        "mg": {
-            "type": "VitisGenerateModelLLM",
-            "recipe": "full_fusion"
-        }
+        "mg": { "type": "VitisGenerateModelLLM", "packed_const": false, "cpu_only": false }
     },
     "log_severity_level": 1,
     "output_dir": "models/Qwen1.5-7B-Chat-vai",
-    "cache_dir": "olive_cache",
+    "cache_dir": "cache",
     "no_artifacts": true
 }
diff --git a/Qwen-Qwen1.5-7B-Chat/VitisAI/README.md b/Qwen-Qwen1.5-7B-Chat/VitisAI/README.md
@@ -14,29 +14,21 @@ This folder contains sample Olive configuration to optimize Qwen models for AMD
 For LLMs - follow the below commands to generate the optimized model for VitisAI Execution Provider.
 
 **Platform Support:**
+- ✅ **Linux with ROCm** - Supported
+- ✅ **Linux with CUDA** - Supported
 - ✅ **Windows with CUDA** - Supported
-- ✅ **Windows with CPU** - Supported
-- ⏳ **Planned for future release:** Linux with ROCm, Linux with CUDA, Windows with ROCm
+- ✅ **Windows with CPU** - Supported (quantization will be slower)
+- ⏳ **Windows with ROCm** - Planned for future release
 
 For more details about quark, see the [Quark Documentation](https://quark.docs.amd.com/latest/)
 
-#### **Create a Python 3.12 conda environment and run the below commands**
+#### **Create a Python 3.10 conda environment and run the below commands**
 ```bash
-conda create -n olive python=3.12
+conda create -n olive python=3.10
 conda activate olive
 ```
 
-#### **Install Olive**
-
-**Option 1: Install from PyPI**
 ```bash
-pip install olive-ai[auto-opt]
-pip install transformers onnxruntime-genai
-```
-
-**Option 2: Install from source**
-```bash
-git clone https://github.com/microsoft/Olive.git
 cd Olive
 pip install -e .
 pip install -r requirements.txt
@@ -49,7 +41,7 @@ cd olive-recipes/Qwen-Qwen1.5-7B-Chat/VitisAI
 pip install --force-reinstall -r requirements_vitisai_llm.txt
 ```
 
-
+**Note:** The requirements file automatically installs the correct `model-generate` version for your platform (1.5.0 for Linux, 1.5.1 for Windows).
 
 #### **Install PyTorch**
 
@@ -68,13 +60,6 @@ pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https
 
 python -c "import torch; print(torch.cuda.is_available())" # Must return `True`
 ```
-
-**For CPU-only (Windows):**
-```bash
-pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-python -c "import torch; print(torch.__version__)"  # Should print 2.7.0+cpu
-```
-
 #### **Generate optimized LLM model for VitisAI NPU**
 Follow the above setup instructions, then run the below command to generate the optimized LLM model for VitisAI EP
 

diff --git a/Qwen-Qwen1.5-7B-Chat/VitisAI/requirements_vitisai_llm.txt b/Qwen-Qwen1.5-7B-Chat/VitisAI/requirements_vitisai_llm.txt
@@ -1,27 +1,26 @@
---extra-index-url=https://pypi.amd.com/olive/1.7.1-5D/simple
 # AMD model generation
 --extra-index-url=https://pypi.amd.com/simple
 accelerate
 
 # Quark
-amd-quark==0.11
+amd-quark==0.9
 datasets
 evaluate
 
-model-generate
+# Platform-specific model-generate versions:
+# Linux: use model-generate==1.5.0 (default)
+# Windows: MUST use model-generate==1.5.1
+model-generate==1.5.0; sys_platform != 'win32'
+model-generate==1.5.1; sys_platform == 'win32'
 
 nltk
-numpy==1.26.4
+numpy
 
 # Pin onnx version
 onnx==1.18.0
-onnxruntime
-onnxruntime-genai
-onnxsim
+onnxruntime==1.21.1
+onnxruntime-genai==0.7.1
 optimum
-
-ryzenai-dynamic-dispatch
-ryzenai-onnx-utils
 sentencepiece
 tabulate
-transformers==4.57.6
+transformers==4.50.0
diff --git a/...VitisAI/Qwen2-1.5B_quark_vitisai_llm.json → ...RyzenAI/Qwen2-1.5B_quark_ryzenai_llm.json b/...VitisAI/Qwen2-1.5B_quark_vitisai_llm.json → ...RyzenAI/Qwen2-1.5B_quark_ryzenai_llm.json
@@ -12,12 +12,12 @@
             "exclude_layers": []
         },
         "mg": {
-            "type": "VitisGenerateModelLLM",
-            "recipe": "full_fusion"
+            "type": "RyzenGenerateModelLLM",
+            "recipe": "token_fusion"
         }
     },
     "log_severity_level": 1,
-    "output_dir": "models/Qwen2-1.5B-vai",
+    "output_dir": "models/Qwen2-1.5B-rai",
     "cache_dir": "olive_cache",
     "no_artifacts": true
 }
diff --git a/Qwen-Qwen2-1.5B/VitisAI/README.md → Qwen-Qwen2-1.5B/RyzenAI/README.md b/Qwen-Qwen2-1.5B/VitisAI/README.md → Qwen-Qwen2-1.5B/RyzenAI/README.md
@@ -5,13 +5,13 @@ This folder contains sample Olive configuration to optimize Qwen models for AMD
 
 | Model Name (Hugging Face)          | Config File Name                |
 | :--------------------------------- | :------------------------------ |
-| `Qwen/Qwen2-1.5B`       | `Qwen2-1.5B_quark_vitisai_llm.json` |
+| `Qwen/Qwen2-1.5B`       | `Qwen2-1.5B_quark_ryzenai_llm.json` |
 
 ## **Run the Quantization Config**
 
 ### **Quark quantization**
 
-For LLMs - follow the below commands to generate the optimized model for VitisAI Execution Provider.
+For LLMs, follow the commands below to generate the optimized model for the RyzenAI Execution Provider.
 
 **Platform Support:**
 - ✅ **Windows with CUDA** - Supported
@@ -42,11 +42,11 @@ pip install -e .
 pip install -r requirements.txt
 ```
 
-#### **Install VitisAI LLM dependencies**
+#### **Install RyzenAI LLM dependencies**
 
 ```bash
-cd olive-recipes/Qwen-Qwen2-1.5B/VitisAI
-pip install --force-reinstall -r requirements_vitisai_llm.txt
+cd olive-recipes/Qwen-Qwen2-1.5B/RyzenAI
+pip install --force-reinstall -r requirements_ryzenai_llm.txt
 ```
 
 
@@ -75,14 +75,14 @@ pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pyt
 python -c "import torch; print(torch.__version__)"  # Should print 2.7.0+cpu
 ```
 
-#### **Generate optimized LLM model for VitisAI NPU**
-Follow the above setup instructions, then run the below command to generate the optimized LLM model for VitisAI EP
+#### **Generate optimized LLM model for RyzenAI NPU**
+Follow the setup instructions above, then run the command below to generate the optimized LLM model for the RyzenAI EP.
 
 ```bash
 # Qwen2-1.5B
-olive run --config Qwen2-1.5B_quark_vitisai_llm.json
+olive run --config Qwen2-1.5B_quark_ryzenai_llm.json
 ```
 
-✅ Optimized model saved in: `models/Qwen2-1.5B-vai/`
+✅ Optimized model saved in: `models/Qwen2-1.5B-rai/`
 
 > **Note:** Output model is saved in `output_dir` mentioned in the json files.
diff --git a/Qwen-Qwen2-1.5B/RyzenAI/info.yaml b/Qwen-Qwen2-1.5B/RyzenAI/info.yaml
@@ -0,0 +1,6 @@
+arch: qwen2
+recipes:
+  - name: Qwen2-1.5B_RyzenAI
+    file: Qwen2-1.5B_quark_ryzenai_llm.json
+    devices: npu
+    eps: RyzenAIExecutionProvider
diff --git a/...ruct/VitisAI/requirements_vitisai_llm.txt → ...1.5B/RyzenAI/requirements_ryzenai_llm.txt b/...ruct/VitisAI/requirements_vitisai_llm.txt → ...1.5B/RyzenAI/requirements_ryzenai_llm.txt
@@ -1,4 +1,4 @@
---extra-index-url=https://pypi.amd.com/olive/1.7.1-5D/simple
+--extra-index-url=https://pypi.amd.com/olive/1.7.1-6D/simple/
 # AMD model generation
 --extra-index-url=https://pypi.amd.com/simple
 accelerate

diff --git a/Qwen-Qwen2-1.5B/VitisAI/info.yaml b/Qwen-Qwen2-1.5B/VitisAI/info.yaml
diff --git a/Qwen-Qwen2-7B-Instruct/VitisAI/Qwen2-7B-Instruct_quark_vitisai_llm.json b/Qwen-Qwen2-7B-Instruct/VitisAI/Qwen2-7B-Instruct_quark_vitisai_llm.json
@@ -0,0 +1,40 @@
+{
+    "input_model": { "type": "HFModel", "model_path": "Qwen/Qwen2-7B-Instruct" },
+    "passes": {
+        "qq": {
+            "type": "QuarkQuantizationVitisAI",
+            "quant_scheme": "w_uint4_per_group_asym",
+            "quant_algo": "awq",
+            "dataset": "pileval_for_awq_benchmark",
+            "data_type": "bfloat16",
+            "num_calib_data": 128,
+            "model_export": [ "hf_format" ],
+            "exclude_layers": [  ],
+            "quant_config": {
+                "name": "awq",
+                "scaling_layers": [
+                    {
+                        "prev_op": "input_layernorm",
+                        "layers": [ "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj" ],
+                        "inp": "self_attn.q_proj",
+                        "module2inspect": "self_attn"
+                    },
+                    { "prev_op": "self_attn.v_proj", "layers": [ "self_attn.o_proj" ], "inp": "self_attn.o_proj" },
+                    {
+                        "prev_op": "post_attention_layernorm",
+                        "layers": [ "mlp.gate_proj", "mlp.up_proj" ],
+                        "inp": "mlp.gate_proj",
+                        "module2inspect": "mlp"
+                    },
+                    { "prev_op": "mlp.up_proj", "layers": [ "mlp.down_proj" ], "inp": "mlp.down_proj" }
+                ],
+                "model_decoder_layers": "model.layers"
+            }
+        },
+        "mg": { "type": "VitisGenerateModelLLM", "packed_const": false, "cpu_only": false }
+    },
+    "log_severity_level": 1,
+    "output_dir": "models/Qwen2-7B-Instruct-vai",
+    "cache_dir": "cache",
+    "no_artifacts": true
+}