diff --git a/olive/cache.py b/olive/cache.py index 7f27a50de..53699b285 100644 --- a/olive/cache.py +++ b/olive/cache.py @@ -541,9 +541,11 @@ def _save_model( output_file = output_dir actual_output_dir = output_dir.parent else: - # Otherwise, create model.onnx in the directory + # Otherwise, create model.onnx in the directory. + # Preserve the source onnx_file_name stem (e.g. model_ctx) so the output + # filename matches what genai_config.json references. actual_output_dir = output_dir - model_file_name = "model" + model_file_name = Path(onnx_file_name).stem if has_additional_files and onnx_file_name else "model" if path_prefix: model_file_name = f"{path_prefix}_{model_file_name}" output_file = output_dir / f"{model_file_name}.onnx" diff --git a/olive/passes/onnx/common.py b/olive/passes/onnx/common.py index 14daffa7c..ba880ae5c 100644 --- a/olive/passes/onnx/common.py +++ b/olive/passes/onnx/common.py @@ -5,6 +5,7 @@ import json import logging import re +from collections.abc import Iterable from copy import deepcopy from pathlib import Path from typing import Any, Callable, Optional, Union @@ -792,40 +793,46 @@ def update_llm_pipeline_genai_config( def update_llm_pipeline_genai_config_gpu( - model: ONNXModelHandler, + model: Union[ONNXModelHandler, CompositeModelHandler], output_model_dir: Union[str, Path], - input_model_path: Union[str, Path], decoder_config_extra: Optional[dict[str, Any]] = None, -) -> ONNXModelHandler: + composite_components: Optional[Iterable[tuple[str, ONNXModelHandler]]] = None, +) -> Union[ONNXModelHandler, CompositeModelHandler]: """Update the LLM pipeline in the model's genai_config.json file. - :param model: The model to update. + :param model: The model (single or composite) to update. + :param output_model_dir: Directory where the updated genai_config.json should be written. :param decoder_config_extra: Extra configuration for the decoder. + :param composite_components: Optional iterable of (component_name, ONNXModelHandler) + used to build a multi-component pipeline. + :return: The same `model` object (with its directory now having updated genai_config.json). """ output_model_dir = Path(output_model_dir) - # update genai_config if it exists + additional_files = model.model_attributes["additional_files"] genai_config_path = None - genai_config_path = Path(input_model_path).parent / "genai_config.json" + for file_path in additional_files: + if Path(file_path).name == "genai_config.json": + genai_config_path = file_path + break - if genai_config_path.exists(): - genai_config_path = str(genai_config_path.resolve()) - else: + if not genai_config_path: return model with open(genai_config_path) as f: genai_config = json.load(f) - # update model_type genai_config["model"]["type"] = "decoder-pipeline" - # Update the provider_options list provider_option = {"qnn": {"backend_type": "gpu"}} - genai_config["model"]["decoder"]["session_options"]["provider_options"] = [provider_option] + decoder = genai_config["model"].setdefault("decoder", {}) + session_opts = decoder.setdefault("session_options", {}) + session_opts["provider_options"] = [provider_option] # update decoder config decoder_config = genai_config["model"]["decoder"] decoder_config.get("sliding_window", {}).pop("slide_inputs", None) + for key, value in (decoder_config_extra or {}).items(): exisiting_value = decoder_config.get(key) if isinstance(exisiting_value, dict): @@ -835,13 +842,39 @@ def update_llm_pipeline_genai_config_gpu( else: decoder_config[key] = value - pipeline_config = {} - component_io_config = model.io_config - pipeline_config["model_onnx"] = { - "filename": Path(model.model_path).name, - "inputs": component_io_config["input_names"], - "outputs": component_io_config["output_names"], - } + # --- Build pipeline_config --- + pipeline_config: dict[str, Any] = {} + + if composite_components is None: + if not isinstance(model, ONNXModelHandler): + handlers = list(model.get_model_components()) + if not handlers: + return model + _, single_handler = handlers[0] + else: + single_handler = model + + component_io_config = single_handler.io_config + component_key = Path(single_handler.model_path).stem + pipeline_config[component_key] = { + "filename": Path(single_handler.model_path).name, + "inputs": component_io_config["input_names"], + "outputs": component_io_config["output_names"], + } + + else: + # Composite case: one entry per component + for comp_name, comp_handler in composite_components: + component_io_config = comp_handler.io_config + pipeline_config[comp_name] = { + "filename": Path(comp_handler.model_path).name, + "inputs": component_io_config["input_names"], + "outputs": component_io_config["output_names"], + } + if comp_name.endswith("iterator"): + pipeline_config[comp_name]["run_on_prompt"] = False + else: + pipeline_config[comp_name]["run_on_token_gen"] = False decoder_config["pipeline"] = [pipeline_config] @@ -849,40 +882,65 @@ def update_llm_pipeline_genai_config_gpu( new_genai_config_path = output_model_dir / "genai_config.json" with new_genai_config_path.open("w") as f: json.dump(genai_config, f, indent=4) + additional_files.remove(genai_config_path) + additional_files.append(str(new_genai_config_path)) return model def update_llm_pipeline_genai_config_gpu_ctxbin( - model_path: Union[str, Path], + model: Union[ONNXModelHandler, CompositeModelHandler], + output_model_path: Union[str, Path], ) -> None: - """Update the filename fields in the model's genai_config.json file from 'model' to 'model_ctx'. + """Update the genai_config.json entry for one context binary component. - The genai_config.json file is updated in place in the model's directory. - :param model_path: Path to the model file. + :param model: Source model is used to locate and update genai_config.json. + :param output_model_path: Path to the context binary output file. """ - # Find genai_config in the model's directory - model_dir = Path(model_path).parent - genai_config_path = model_dir / "genai_config.json" + output_model_path = Path(output_model_path) + + # Extract additional_files from model -- same as update_llm_pipeline_genai_config_gpu + additional_files = model.model_attributes["additional_files"] + genai_config_path = None + for file_path in additional_files: + if Path(file_path).name == "genai_config.json": + genai_config_path = file_path + break + + if not genai_config_path: + return - if not genai_config_path.exists(): + ctx_stem = output_model_path.stem + if not ctx_stem.endswith("_ctx"): return + src_stem = ctx_stem[: -len("_ctx")] + src_filename = f"{src_stem}.onnx" + ctx_filename = f"{ctx_stem}.onnx" with open(genai_config_path) as f: genai_config = json.load(f) - # Update decoder filename to 'model_ctx' - if "decoder" in genai_config.get("model", {}): - if "filename" in genai_config["model"]["decoder"]: - genai_config["model"]["decoder"]["filename"] = "model/model_ctx.onnx" + decoder = genai_config.get("model", {}).get("decoder", {}) - # Update filename in pipeline configuration - decoder_config = genai_config["model"]["decoder"] - if "pipeline" in decoder_config and isinstance(decoder_config["pipeline"], list): - for pipeline_item in decoder_config["pipeline"]: - if "model_onnx" in pipeline_item and "filename" in pipeline_item["model_onnx"]: - pipeline_item["model_onnx"]["filename"] = "model/model_ctx.onnx" + # Update top-level decoder.filename if it points to this model + if decoder.get("filename") == src_filename: + decoder["filename"] = ctx_filename - # Save the updated genai_config back to the same location - with genai_config_path.open("w") as f: + # Update the single matching pipeline entry + for pipeline_item in decoder.get("pipeline", []): + if not isinstance(pipeline_item, dict): + continue + for comp_name in list(pipeline_item.keys()): + comp = pipeline_item[comp_name] + if isinstance(comp, dict) and comp.get("filename") == src_filename: + comp["filename"] = ctx_filename + if comp_name == src_stem: + pipeline_item[ctx_stem] = pipeline_item.pop(comp_name) + break # only one entry matches per call + + # Save to output dir and update additional_files pointer. + new_genai_config_path = output_model_path.parent / "genai_config.json" + with new_genai_config_path.open("w") as f: json.dump(genai_config, f, indent=4) + additional_files.remove(genai_config_path) + additional_files.append(str(new_genai_config_path)) diff --git a/olive/passes/onnx/context_binary.py b/olive/passes/onnx/context_binary.py index ba7fc433b..928b91189 100644 --- a/olive/passes/onnx/context_binary.py +++ b/olive/passes/onnx/context_binary.py @@ -117,6 +117,7 @@ def _run_single_target( "session_options": config.session_options, "embed_context": config.embed_context, "disable_cpu_fallback": config.disable_cpu_fallback, + "model": model, } if isinstance(model, ONNXModelHandler): @@ -243,6 +244,7 @@ def _generate_context_binary( share_ep_contexts: bool = False, stop_share_ep_contexts: bool = False, ignore_missing_cb_bin: bool = False, + model: Optional[Union[ONNXModelHandler, CompositeModelHandler]] = None, ) -> ONNXModelHandler: """Generate context binary for the model. @@ -271,7 +273,7 @@ def _generate_context_binary( if execution_provider == ExecutionProvider.QNNExecutionProvider: if str(device).lower() == "gpu": provider_options["backend_path"] = "libQnnGpu.so" if platform.system() == "Linux" else "QnnGpu.dll" - update_llm_pipeline_genai_config_gpu_ctxbin(model_path) + update_llm_pipeline_genai_config_gpu_ctxbin(model, Path(output_model_path)) else: provider_options["backend_path"] = "libQnnHtp.so" if platform.system() == "Linux" else "QnnHtp.dll" if share_ep_contexts: diff --git a/olive/passes/onnx/static_llm.py b/olive/passes/onnx/static_llm.py index b12ca5c34..294759aff 100644 --- a/olive/passes/onnx/static_llm.py +++ b/olive/passes/onnx/static_llm.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- import logging +from copy import deepcopy from pathlib import Path import onnx @@ -56,6 +57,11 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon default_value=64, description="Input length of the context model.", ), + "context_iterator_models": PassConfigParam( + type_=bool, + default_value=True, + description=("To generate context and iterative models. Specifically for QNN GPU"), + ), "group_session_options": PassConfigParam( type_=dict, description=( @@ -182,57 +188,146 @@ def process_context_iterator(component_models, llm_pipeline, output_dir): ) def _run_qnn_gpu(self, model: ONNXModelHandler, config: type[BasePassConfig], output_model_path: Path): + """QNN_GPU path: generate one or more static ONNX models for different context lengths. + + - If config.context_iterator_models is false: generate single model. + - If config.context_iterator_models is true: generate multiple models (ar1 and arN) and return CompositeModelHandler. + """ output_model_dir = Path(output_model_path).with_suffix("") model_path = Path(model.model_path) # --- Step 1: Load model (handle both single and external data) --- try: - model_proto = onnx.load(model_path, load_external_data=True) + base_model_proto = onnx.load(model_path, load_external_data=True) except Exception as e: raise RuntimeError(f"Failed to load ONNX model: {e}") from e - # --- Step 2: Fix symbolic dimensions --- - batch_size, sequence_length = OnnxDAG(model_proto).get_io_shape("input_ids") + # --- Step 2: Get symbolic batch and sequence dims once --- + batch_size, sequence_length = OnnxDAG(base_model_proto).get_io_shape("input_ids") if not (isinstance(batch_size, str) and isinstance(sequence_length, str)): raise ValueError("Input dimensions must be symbolic before static shape fixing.") - param_mapping = {batch_size: config.batch_size, sequence_length: config.context_length} - self.fix_shape(model_proto, param_mapping) + context_iterator_models = getattr(config, "context_iterator_models", True) - # --- Step 3: Save model as external-data format --- - output_model_file = Path(output_model_dir) / "model.onnx" - external_data_file = Path(output_model_dir) / "model.onnx.data" + if not context_iterator_models: + # Single model mode + ctx_lengths_list = [int(config.context_length)] + else: + # Composite model mode → AR1 + AR-N + n = int(config.context_length) + ctx_lengths_list = [n, 1] + + multiple = len(ctx_lengths_list) > 1 + + generated_handlers: dict[int, ONNXModelHandler] = {} + generated_names: dict[int, str] = {} + + for ctx_len in ctx_lengths_list: + # --- Clone base model proto for this variant --- + model_proto = onnx.ModelProto() + model_proto.CopyFrom(base_model_proto) + + # --- Step 3: Fix symbolic dimensions for this context length --- + param_mapping = {batch_size: config.batch_size, sequence_length: ctx_len} + self.fix_shape(model_proto, param_mapping) + + add_version_metadata_to_model_proto(model_proto) + + # --- Step 4: Save as external-data ONNX --- + # single model: "model", composite: "context" (AR-N) or "iterator" (AR-1) + if not multiple: + logical_name = "model" + elif ctx_len == 1: + logical_name = "iterator" + else: + logical_name = "context" + onnx_file_name = f"{logical_name}.onnx" + output_model_file = Path(output_model_dir) / onnx_file_name + external_data_file = Path(output_model_dir) / f"{onnx_file_name}.data" + + output_model_dir.mkdir(parents=True, exist_ok=True) + onnx.save( + model_proto, + str(output_model_file), + save_as_external_data=True, + all_tensors_to_one_file=True, + location=external_data_file.name, + convert_attribute=False, + ) + + # Build handler for this static model + new_model_attributes = deepcopy(model.model_attributes) or {} + handler = ONNXModelHandler( + model_path=output_model_dir, + onnx_file_name=output_model_file.name, + model_attributes=new_model_attributes, + ) + + # Store handler + logical component name + generated_handlers[ctx_len] = handler + generated_names[ctx_len] = logical_name + + # --- Step 5: Update genai_config.json --- + # For single model: pipeline with one component. + # For multiple models: pipeline with multiple components (composite). + if not multiple: + # Single context length + ctx_len = ctx_lengths_list[0] + handler = generated_handlers[ctx_len] + + decoder_config_extra = { + "inputs": { + "past_sequence_length": "past_seq_len", + "total_sequence_length": "total_seq_len", + }, + "sliding_window": { + "window_size": ctx_len, + "pad_value": 0, + "alignment": "left", + "slide_key_value_cache": False, + }, + } - onnx.save( - model_proto, - str(output_model_file), - save_as_external_data=True, - all_tensors_to_one_file=True, - location=external_data_file.name, - convert_attribute=False, + return update_llm_pipeline_genai_config_gpu( + model=handler, + output_model_dir=output_model_dir, + decoder_config_extra=decoder_config_extra, + composite_components=None, + ) + + # Multiple context lengths -> wrap in CompositeModelHandler and create composite pipeline + components = [] + component_names = [] + + for ctx_len, handler in generated_handlers.items(): + components.append(handler) + component_names.append(generated_names[ctx_len]) + + new_model_attributes = deepcopy(model.model_attributes) or {} + + composite = CompositeModelHandler( + model_components=components, model_component_names=component_names, model_attributes=new_model_attributes ) - decoder_config_extra = { + # Build per-component sliding_window config keyed by name + composite_decoder_extra = { "inputs": { "past_sequence_length": "past_seq_len", "total_sequence_length": "total_seq_len", }, "sliding_window": { - "window_size": config.context_length, + "window_size": max(ctx_lengths_list), "pad_value": 0, "alignment": "left", "slide_key_value_cache": False, }, } - input_model_path = model.model_path - model_static = ONNXModelHandler(model_path=output_model_dir, onnx_file_name=output_model_file.name) - return update_llm_pipeline_genai_config_gpu( - model_static, - output_model_dir, - input_model_path, - decoder_config_extra, + model=composite, + output_model_dir=output_model_dir, + decoder_config_extra=composite_decoder_extra, + composite_components=list(zip(component_names, components)), ) @staticmethod