def resolve_diffusers_tokenizer_path(model_path: str, load_kwargs: Optional[dict[str, Any]] = None) -> str:
    """Resolve the tokenizer location for a diffusers pipeline sub-model.

    When the load kwargs name a ``subfolder`` (either directly or nested under
    ``extra_args``), the tokenizer is looked up in this order:

    1. ``<model_path>/tokenizer`` if it contains ``tokenizer_config.json``.
    2. ``<model_path>/<subfolder>`` otherwise.

    When no subfolder is configured, ``model_path`` is returned as given.

    Args:
        model_path: Path (or model identifier) of the pipeline root.
        load_kwargs: Optional load kwargs; ``None`` is treated as empty.

    Returns:
        The path to hand to the tokenizer loader, as a string.
    """
    load_kwargs = load_kwargs or {}
    # `extra_args` may be present but explicitly None; `.get(key, {})` only
    # covers the missing-key case, so normalize with `or {}` as well.
    extra_args = load_kwargs.get("extra_args") or {}
    subfolder = load_kwargs.get("subfolder") or extra_args.get("subfolder")
    if not subfolder:
        # Nothing to resolve: hand the input back untouched. Round-tripping it
        # through Path would rewrite it (dropping "./" and trailing "/",
        # collapsing "//", using "\\" separators on Windows), which can
        # corrupt a non-filesystem identifier such as "org/model".
        return str(model_path)

    pipeline_path = Path(model_path)
    tokenizer_path = pipeline_path / "tokenizer"
    if (tokenizer_path / "tokenizer_config.json").exists():
        return str(tokenizer_path)
    # NOTE(review): this fallback assumes model_path is a local directory;
    # confirm how callers expect a remote identifier + subfolder to behave.
    return str(pipeline_path / subfolder)
+ ), + ), "enable_cuda_graph": PassConfigParam( type_=bool, required=False, diff --git a/olive/passes/pytorch/train_utils.py b/olive/passes/pytorch/train_utils.py index 7900b827e4..6fba79bb44 100644 --- a/olive/passes/pytorch/train_utils.py +++ b/olive/passes/pytorch/train_utils.py @@ -14,6 +14,7 @@ from transformers import __version__ as transformers_version from olive.common.config_utils import NestedConfig, validate_config +from olive.common.hf.utils import resolve_diffusers_tokenizer_path from olive.common.utils import cleanup_memory from olive.data.config import DataConfig from olive.data.template import huggingface_data_config_template @@ -279,7 +280,7 @@ def get_calibration_dataset( """ if not data_config and isinstance(model, HfModelHandler): data_config = get_calibration_data_config( - model.model_name_or_path, + resolve_diffusers_tokenizer_path(model.model_name_or_path, model.get_load_kwargs()), trust_remote_code=model.get_load_kwargs().get("trust_remote_code", False), split=split, batch_size=batch_size, diff --git a/olive/workflows/run/config.py b/olive/workflows/run/config.py index ff641fd728..b30ccca672 100644 --- a/olive/workflows/run/config.py +++ b/olive/workflows/run/config.py @@ -11,6 +11,7 @@ from olive.cache import CacheConfig from olive.common.config_utils import NestedConfig, validate_config from olive.common.constants import DEFAULT_CACHE_DIR, DEFAULT_HF_TASK, DEFAULT_WORKFLOW_ID +from olive.common.hf.utils import resolve_diffusers_tokenizer_path from olive.data.config import DataComponentConfig, DataConfig from olive.data.container.dummy_data_container import TRANSFORMER_DUMMY_DATA_CONTAINER from olive.data.container.huggingface_container import HuggingfaceContainer @@ -224,7 +225,9 @@ def validate_data_configs_with_hf_model(cls, v, info): model_name = input_model_config["config"]["model_attributes"].get("_name_or_path") else: task = input_model_config["config"].get("task", DEFAULT_HF_TASK) - model_name = 
def test_model_builder_prompt_embeds_options_forwarded(tmp_path, monkeypatch):
    """Check that prompt-embeds related options reach the builder's create_model call."""
    source_model = make_local_tiny_llama(tmp_path / "input_model", "hf")
    destination = tmp_path / "output_model"
    # Filled in by the fake builder so the forwarded kwargs can be inspected afterwards.
    seen_kwargs = {}

    def fake_create_model(*_, **kwargs):
        seen_kwargs.update(kwargs)
        target_dir = Path(kwargs["output_dir"])
        # Write the two placeholder artifacts into the builder's output directory.
        onnx_file = target_dir / kwargs["filename"]
        onnx_file.write_text("dummy onnx file")
        genai_config_file = target_dir / "genai_config.json"
        genai_config_file.write_text("{}")

    _mock_genai_builder(monkeypatch, fake_create_model)

    pass_config = {
        "precision": "int4",
        "exclude_lm_head": True,
        "use_cache": False,
        "hidden_states_layers": [9, 18, 27],
    }
    builder_pass = create_pass_from_dict(ModelBuilder, pass_config, disable_search=True)
    result = builder_pass.run(source_model, destination)

    assert isinstance(result, ONNXModelHandler)
    assert seen_kwargs["exclude_lm_head"] is True
    assert seen_kwargs["use_cache"] is False
    assert seen_kwargs["hidden_states_layers"] == [9, 18, 27]