Skip to content
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/inference_endpoint/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,11 @@ def _add_shared_benchmark_args(parser):
parser.add_argument(
"--report-dir", type=Path, help="Path to save detailed benchmark report"
)
parser.add_argument(
"--ensure-submission-checker-compatibility",
action="store_true",
Comment thread
anandhu-eng marked this conversation as resolved.
help="Enable loadgen compatibility mode for submission checker",
Comment thread
anandhu-eng marked this conversation as resolved.
)
Comment thread
anandhu-eng marked this conversation as resolved.


def _add_online_specific_args(parser):
Expand Down
28 changes: 27 additions & 1 deletion src/inference_endpoint/commands/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,11 @@
from transformers import AutoTokenizer
from transformers.utils import logging as transformers_logging

from inference_endpoint.commands.utils import get_default_report_path
from inference_endpoint.commands.utils import (
generate_mlperf_log_details_submission_checker,
generate_user_conf_submission_checker,
get_default_report_path,
)
from inference_endpoint.config.runtime_settings import RuntimeSettings
from inference_endpoint.config.schema import (
APIType,
Expand Down Expand Up @@ -276,6 +280,9 @@ def _build_config_from_cli(
timeout = getattr(args, "timeout", None)
verbose_level = getattr(args, "verbose", 0)
api_type = APIType(getattr(args, "api_type", "openai"))
ensure_submission_checker_compatibility = getattr(
args, "ensure_submission_checker_compatibility", False
)
# Build BenchmarkConfig from CLI params
return BenchmarkConfig(
name=f"cli_{benchmark_mode}",
Expand Down Expand Up @@ -331,6 +338,7 @@ def _build_config_from_cli(
report_dir=report_dir,
timeout=timeout,
verbose=verbose_level > 0,
ensure_submission_checker_compatibility=ensure_submission_checker_compatibility,
)


Expand Down Expand Up @@ -663,6 +671,24 @@ def signal_handler(signum, frame):
except Exception as e:
logger.error(f"Save failed: {e}")

if config.ensure_submission_checker_compatibility:
try:
# convert runtime_settings.json into a loadgen-style user.conf
generate_user_conf_submission_checker(report_dir)
except Exception as e:
logger.error(
f"Failed to generate user conf for submission checker: {e}"
Comment thread
anandhu-eng marked this conversation as resolved.
)
raise
try:
# generate mlperf_log_details.txt from summary.json
generate_mlperf_log_details_submission_checker(report_dir, strict=True)
except Exception as e:
logger.error(
f"Failed to generate mlperf_log_details.txt for submission checker: {e}"
)
raise

except KeyboardInterrupt:
logger.warning("Benchmark interrupted by user")
raise
Expand Down
94 changes: 94 additions & 0 deletions src/inference_endpoint/commands/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import argparse
import datetime
import json
import logging
import os
import platform
Expand All @@ -31,6 +32,7 @@
from pydantic import ValidationError as PydanticValidationError

from .. import __version__
from ..config.constants import ENDPOINTS_TO_LOADGEN_KEY_MAPPING
from ..config.schema import TEMPLATE_TYPE_MAP, BenchmarkConfig
from ..config.yaml_loader import ConfigError, ConfigLoader
from ..exceptions import InputValidationError, SetupError
Expand Down Expand Up @@ -313,3 +315,95 @@ def get_default_report_path() -> Path:
return Path(
f"{tempfile.gettempdir()}/reports_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
)


def generate_user_conf_submission_checker(report_dir: Path) -> None:
    """Generate user.conf file for submission checker from runtime_settings.json.

    Every top-level entry of runtime_settings.json becomes one
    ``*.*.<key>=<value>`` line in user.conf. Keys listed in
    config.constants.ENDPOINTS_TO_LOADGEN_KEY_MAPPING are renamed to their
    loadgen equivalent; all other keys are written unchanged. Values are
    written with their default string formatting and are not converted.

    Args:
        report_dir: Path to the report directory containing
            runtime_settings.json. user.conf is written to the same directory.

    Raises:
        FileNotFoundError: If runtime_settings.json does not exist in report_dir.
        ValueError: If runtime_settings.json is not valid JSON
            (json.JSONDecodeError is a ValueError) or its top-level value is
            not a JSON object.
        OSError: If either file cannot be read or written.
    """
    runtime_settings_path = report_dir / "runtime_settings.json"
    user_conf_path = report_dir / "user.conf"

    if not runtime_settings_path.exists():
        logger.error(f"runtime_settings.json not found in {report_dir}")
        raise FileNotFoundError(f"runtime_settings.json not found in {report_dir}")
    try:
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(runtime_settings_path, encoding="utf-8") as f:
            runtime_settings = json.load(f)

        # Fail with a clear message instead of an AttributeError on .items().
        if not isinstance(runtime_settings, dict):
            raise ValueError(
                "runtime_settings.json must contain a JSON object, "
                f"got {type(runtime_settings).__name__}"
            )

        # Map endpoints key to loadgen key if mapping exists, otherwise use same key
        conf_lines = [
            f"*.*.{ENDPOINTS_TO_LOADGEN_KEY_MAPPING.get(key, key)}={value}\n"
            for key, value in runtime_settings.items()
        ]

        with open(user_conf_path, "w", encoding="utf-8") as f:
            f.writelines(conf_lines)

        logger.info(f"Generated user.conf at {user_conf_path}")

    except Exception as e:
        # Log here for context, then let the caller decide how to react.
        logger.error(f"Failed to generate user.conf: {e}")
        raise


def generate_mlperf_log_details_submission_checker(
    report_dir: Path, strict: bool
) -> None:
    """Generate mlperf_log_details.txt file for submission checker from summary.json.

    summary.json is read line by line. Each line that starts with the
    ``:::ENDPTS`` marker is expected to carry a JSON object after the marker;
    its ``"key"`` field is renamed using
    config.constants.ENDPOINTS_TO_LOADGEN_KEY_MAPPING (keys without a mapping
    are kept) and the record is written back, compactly serialized, behind the
    same marker. Blank lines are ignored; other lines without the marker are
    skipped with a warning.

    The output file is only written after the whole input has been processed,
    so a failure in strict mode does not leave a partial
    mlperf_log_details.txt behind.

    Args:
        report_dir: Path to the report directory containing summary.json.
            mlperf_log_details.txt is written to the same directory.
        strict: If True, a marker line whose payload is not a valid JSON
            object raises; if False, such a line is logged and skipped.

    Raises:
        FileNotFoundError: If summary.json does not exist in report_dir.
        ValueError: In strict mode, if a marker line does not contain a valid
            JSON object (json.JSONDecodeError is a ValueError).
        OSError: If either file cannot be read or written.
    """
    summary_path = report_dir / "summary.json"
    log_details_path = report_dir / "mlperf_log_details.txt"
    marker = ":::ENDPTS"

    if not summary_path.exists():
        logger.error(f"summary.json not found in {report_dir}")
        raise FileNotFoundError(f"summary.json not found in {report_dir}")
    try:
        output_lines: list[str] = []
        with open(summary_path, encoding="utf-8") as summary_file:
            for raw_line in summary_file:
                line = raw_line.strip()
                if not line:
                    # Blank lines carry no record; not worth a warning.
                    continue
                if not line.startswith(marker):
                    logger.warning(f"Found invalid line {line}, skipping.")
                    continue
                try:
                    record = json.loads(line[len(marker) :])
                    if not isinstance(record, dict):
                        # Valid JSON but not a record; handle like a parse error.
                        raise ValueError("record is not a JSON object")
                except ValueError as e:
                    if strict:
                        logger.error(f"Encountered invalid line: {line} Error: {e}")
                        raise
                    logger.warning(f"Skipping invalid line: {line}")
                    continue
                # map keys
                original_key = record.get("key")
                if (
                    isinstance(original_key, str)
                    and original_key in ENDPOINTS_TO_LOADGEN_KEY_MAPPING
                ):
                    record["key"] = ENDPOINTS_TO_LOADGEN_KEY_MAPPING[original_key]
                output_lines.append(
                    f"{marker} {json.dumps(record, separators=(',', ':'))}\n"
                )

        # Write once, after parsing succeeded, so no truncated file is left.
        with open(log_details_path, "w", encoding="utf-8") as output_file:
            output_file.writelines(output_lines)

        logger.info(f"Generated mlperf_log_details.txt at {log_details_path}")

    except Exception as e:
        # Log here for context, then let the caller decide how to react.
        logger.error(f"Failed to generate mlperf_log_details.txt: {e}")
        raise
86 changes: 86 additions & 0 deletions src/inference_endpoint/config/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Global constants and mappings for the inference endpoint package."""

# Mapping from endpoints result/settings keys to the key names supported by
# MLPerf loadgen and the submission checker. It is used when generating
# user.conf and mlperf_log_details.txt for the submission checker.
# Format: {"endpoints_key": "loadgen_key"}
#
# Only key names are translated; the generators that consume this mapping do
# not convert values.
# NOTE(review): many loadgen names end in ``_ns`` -- confirm the endpoints
# values are already reported in nanoseconds.
#
# Entries whose key and value are identical are listed explicitly even though
# consumers fall back to the original key when no mapping exists.
ENDPOINTS_TO_LOADGEN_KEY_MAPPING: dict[str, str] = {
    # --- Version / provenance ---
    "endpoints_version": "loadgen_version",
    "endpoints_git_commit_date": "loadgen_git_commit_date",
    "endpoints_git_commit_hash": "loadgen_git_commit_hash",
    "test_datetime": "test_datetime",
    # --- Sample counts ---
    "n_samples_issued": "qsl_reported_total_count",
    "n_samples_from_dataset": "qsl_reported_performance_count",
    # --- Scenario, mode and run limits ---
    "effective_scenario": "effective_scenario",
    "mode": "effective_test_mode",
    "streaming": "streaming",
    "output_sequence_lengths.min": "min_output_tokens",
    "output_sequence_lengths.max": "max_output_tokens",
    "load_pattern": "load_pattern",
    "min_duration_ms": "effective_min_duration_ms",
    "max_duration_ms": "effective_max_duration_ms",
    "effective_target_duration_ms": "effective_target_duration_ms",
    "min_sample_count": "effective_min_query_count",
    "effective_sample_index_rng_seed": "effective_sample_index_rng_seed",
    "effective_schedule_rng_seed": "effective_schedule_rng_seed",
    "effective_sample_concatenate_permutation": "effective_sample_concatenate_permutation",
    "effective_samples_per_query": "effective_samples_per_query",
    "generated_query_count": "generated_query_count",
    "generated_query_duration": "generated_query_duration",
    # --- Throughput and concurrency ---
    "target_qps": "effective_target_qps",  # (results_summary.json)
    "result_scheduled_samples_per_sec": "result_scheduled_samples_per_sec",
    "qps": "result_completed_samples_per_sec",
    "results_sample_per_second": "results_sample_per_second",
    "effective_max_concurrency": "effective_max_async_queries",
    "effective_target_latency_ns": "effective_target_latency_ns",
    "effective_target_latency_percentile": "effective_target_latency_percentile",
    # --- Latency statistics ("latency.*" keys) ---
    "latency.min": "result_min_latency_ns",
    "latency.max": "result_max_latency_ns",
    "latency.avg": "result_mean_latency_ns",
    "latency.percentiles.50": "result_50.00_percentile_latency_ns",
    "latency.percentiles.90": "result_90.00_percentile_latency_ns",
    "latency.percentiles.95": "result_95.00_percentile_latency_ns",
    "latency.percentiles.99": "result_99.00_percentile_latency_ns",
    "latency.percentiles.99.9": "result_99.90_percentile_latency_ns",
    # --- First-token latency statistics ("ttft.*" keys) ---
    "ttft.min": "result_first_token_min_latency_ns",
    "ttft.max": "result_first_token_max_latency_ns",
    "ttft.avg": "result_first_token_mean_latency_ns",
    "ttft.percentiles.50": "result_first_token_50.00_percentile_latency_ns",
    "ttft.percentiles.90": "result_first_token_90.00_percentile_latency_ns",
    "ttft.percentiles.95": "result_first_token_95.00_percentile_latency_ns",
    "ttft.percentiles.99": "result_first_token_99.00_percentile_latency_ns",
    "ttft.percentiles.99.9": "result_first_token_99.90_percentile_latency_ns",
    # --- Per-output-token statistics ("tpot.*" keys) ---
    # NOTE(review): percentiles map to ``time_per_output_token`` names while
    # min/max/avg map to ``time_to_output_token`` names -- presumably this
    # mirrors loadgen's own key spelling; confirm before "fixing" it.
    "tpot.percentiles.50": "result_time_per_output_token_50.00_percentile_ns",
    "tpot.percentiles.90": "result_time_per_output_token_90.00_percentile_ns",
    "tpot.percentiles.95": "result_time_per_output_token_95.00_percentile_ns",
    "tpot.percentiles.99": "result_time_per_output_token_99.00_percentile_ns",
    "tpot.percentiles.99.9": "result_time_per_output_token_99.90_percentile_ns",
    "tpot.min": "result_time_to_output_token_min",
    "tpot.max": "result_time_to_output_token_max",
    "tpot.avg": "result_time_to_output_token_mean",
    "tps": "result_completed_tokens_per_second",
    # --- Validity flags, early stopping and query counts ---
    "result_validity": "result_validity",
    "result_perf_constraints_met": "result_perf_constraints_met",
    "result_min_duration_met": "result_min_duration_met",
    "result_min_queries_met": "result_min_queries_met",
    "early_stopping_met": "early_stopping_met",
    "early_stopping_ttft_result": "early_stopping_ttft_result",
    "early_stopping_tpot_result": "early_stopping_tpot_result",
    "result.total": "result_query_count",
    "result_overlatency_query_count": "result_overlatency_query_count",
    "result.failed": "num_errors",
}
1 change: 1 addition & 0 deletions src/inference_endpoint/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ class BenchmarkConfig(BaseModel):
report_dir: Path | None = None
timeout: int | None = None
verbose: bool = False
ensure_submission_checker_compatibility: bool = False

@classmethod
def from_yaml_file(cls, path: Path) -> BenchmarkConfig:
Expand Down
Loading