Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ env-*
.coverage
htmlcov/
*.html
vdb_benchmark/vdbbench/benchmark/results/*

# OS files
.DS_Store
Expand Down
37 changes: 33 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,39 @@ full = [
"dlio-benchmark",
]
vectordb = [
"pymilvus>=2.4.0",
"numpy>=1.24",
"pandas>=2.0",
"tabulate>=0.9",
"pymilvus>=2.4.0",
"psycopg2-binary>=2.9",
"pgvector>=0.2",
"elasticsearch>=8.0",
"numpy>=1.24",
"pandas>=2.0",
"pyyaml>=6.0",
"tabulate>=0.9",
]

# Per-backend extras: each one lists a single vector-database client together
# with the shared numpy / pandas / pyyaml / tabulate dependencies, so a smoke
# test can install only what it needs (e.g. `uv sync --extra vectordb-milvus`).

# Milvus client only.
vectordb-milvus = [
"pymilvus>=2.4.0",
"numpy>=1.24",
"pandas>=2.0",
"pyyaml>=6.0",
"tabulate>=0.9",
]

# PostgreSQL driver plus the pgvector Python package.
vectordb-pgvector = [
"psycopg2-binary>=2.9",
"pgvector>=0.2",
"numpy>=1.24",
"pandas>=2.0",
"pyyaml>=6.0",
"tabulate>=0.9",
]

# Elasticsearch client only.
vectordb-elasticsearch = [
"elasticsearch>=8.0",
"numpy>=1.24",
"pandas>=2.0",
"pyyaml>=6.0",
"tabulate>=0.9",
]

[project.urls]
Expand Down
43 changes: 43 additions & 0 deletions tests/configs/elasticsearch_5k_hnsw.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Smoke-test configuration: Elasticsearch backend, 5,000 vectors, HNSW index.
# Passed to the runner by tests/run_elasticsearch_5k_hnsw.sh via --config; that
# script also passes --backend elasticsearch, --mode both and --force itself.
backend: elasticsearch
mode: both

database:
# The smoke-test script additionally exports ELASTICSEARCH__HOST
# (presumably an override of this value — confirm against the runner).
host: http://localhost:9200

dataset:
collection_name: pr316_es_hnsw_5k
num_vectors: 5000
dimension: 64
distribution: uniform
block_size: 1000
batch_size: 500
# Fixed seed (presumably for reproducible data generation — confirm).
seed: 42

query:
# The smoke-test script asserts total_queries == 200; presumably that is
# num_query_vectors (100) x num_search_rounds (2) — keep them in sync.
num_query_vectors: 100
query_seed: 99

ground_truth:
truth_k: 20
truth_mode: precomputed

index:
index_type: HNSW
metric_type: COSINE
index_params:
# NOTE(review): lowercase keys here, while the Milvus and pgvector configs
# use M / efConstruction — confirm each backend expects its own spelling.
m: 16
ef_construction: 64
num_shards: 1

search:
search_k: 10
num_search_rounds: 2
search_batch_size: 5
log_interval: 50
search_params:
num_candidates: 100

workflow:
force: true
compact: false
monitor_interval: 2
42 changes: 42 additions & 0 deletions tests/configs/milvus_10k_hnsw.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Smoke-test configuration: Milvus backend, 10,000 vectors, HNSW index.
# Passed to the runner by tests/run_milvus_10k_hnsw.sh via --config; that
# script also passes --backend milvus, --mode both and --force itself.
backend: milvus
mode: both

database:
# The smoke-test script additionally exports MILVUS__HOST / MILVUS__PORT
# (presumably overrides of these values — confirm against the runner).
host: 127.0.0.1
port: 19530

dataset:
collection_name: pr316_milvus_hnsw_10k
num_vectors: 10000
dimension: 128
distribution: uniform
block_size: 2000
batch_size: 500
# Fixed seed (presumably for reproducible data generation — confirm).
seed: 42

query:
# The smoke-test script asserts total_queries == 400; presumably that is
# num_query_vectors (200) x num_search_rounds (2) — keep them in sync.
num_query_vectors: 200
query_seed: 99

ground_truth:
# NOTE(review): no truth_mode here, unlike the Elasticsearch and pgvector
# configs which set truth_mode: precomputed — confirm the default is intended.
truth_k: 50

index:
index_type: HNSW
metric_type: COSINE
index_params:
M: 16
efConstruction: 100
num_shards: 1

search:
search_k: 10
num_search_rounds: 2
search_batch_size: 10
search_params:
ef: 64

workflow:
force: true
compact: true
monitor_interval: 2
47 changes: 47 additions & 0 deletions tests/configs/pgvector_5k_hnsw.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Smoke-test configuration: PostgreSQL + pgvector backend, 5,000 vectors,
# HNSW index. The readme in this directory lists run_pgvector_5k_hnsw.sh as the
# script that exercises it.
backend: pgvector
mode: both

database:
host: 127.0.0.1
port: 5432
dbname: postgres
# Throwaway credentials for a local test instance only; do not reuse them for
# any shared or production database.
user: postgres
password: postgres

dataset:
collection_name: pr316_pgvector_hnsw_5k
num_vectors: 5000
dimension: 64
distribution: uniform
block_size: 1000
batch_size: 500
# Fixed seed (presumably for reproducible data generation — confirm).
seed: 42

query:
num_query_vectors: 100
query_seed: 99

ground_truth:
truth_k: 20
truth_mode: precomputed

index:
index_type: HNSW
metric_type: COSINE
index_params:
# NOTE(review): pgvector's own HNSW options are spelled m / ef_construction;
# confirm the pgvector backend maps these Milvus-style keys.
M: 16
efConstruction: 64
num_shards: 1

search:
search_k: 10
num_search_rounds: 2
search_batch_size: 5
log_interval: 50
search_params:
ef_search: 40

workflow:
force: true
compact: false
monitor_interval: 2
21 changes: 21 additions & 0 deletions tests/configs/vectordb_readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# VDB modular runner smoke tests

These smoke tests validate the modular backend-agnostic VDB runner added under `vdb_benchmark/vdbbench/benchmark`.

They are intentionally small and are meant for PR validation, not official MLPerf Storage result generation.

## What these tests cover

| Script | Backend | Size | Index |
|---|---:|---:|---|
| `run_milvus_10k_hnsw.sh` | Milvus | 10,000 vectors | HNSW |
| `run_pgvector_5k_hnsw.sh` | PostgreSQL + pgvector | 5,000 vectors | HNSW |
| `run_elasticsearch_5k_hnsw.sh` | Elasticsearch | 5,000 vectors | HNSW |

Each script verifies that the modular runner writes:

```text
query_vectors.npy
ground_truth.npz
search_results.json
benchmark_meta.json
```
90 changes: 90 additions & 0 deletions tests/run_elasticsearch_5k_hnsw.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env bash
#
# Smoke test: run the modular VDB benchmark against Elasticsearch with the
# 5k-vector HNSW config, then verify the expected output files and metrics.
#
# Environment overrides (all optional):
#   ROOT_DIR                Repository root containing pyproject.toml
#                           (auto-detected via git / upward search when unset).
#   CONFIG                  Benchmark config; relative paths resolve from the
#                           repository root.
#   OUT_DIR                 Output directory. It is DELETED before the run.
#   EXPECTED_TOTAL_QUERIES  Expected "total_queries" in search_results.json
#                           (default: 200). Adjust when overriding CONFIG.
#   ELASTICSEARCH_HOST      Elasticsearch endpoint URL.
#   ELASTICSEARCH_API_KEY   API key; forwarded when key or cloud id is set.
#   ELASTICSEARCH_CLOUD_ID  Cloud id; forwarded when key or cloud id is set.
set -euo pipefail

die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

command -v uv >/dev/null 2>&1 || die "uv is required but was not found on PATH."

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# An explicit ROOT_DIR wins: the error message below tells users to set it, so
# it must not be silently overwritten by auto-detection.
ROOT_DIR="${ROOT_DIR:-}"

# Prefer the git repo root when available. Failure here (not a git checkout,
# git not installed) is expected and handled by the fallback, hence `|| true`.
if [[ -z "${ROOT_DIR}" ]]; then
  ROOT_DIR="$(git -C "${SCRIPT_DIR}" rev-parse --show-toplevel 2>/dev/null || true)"
fi

# Fallback: walk upward until pyproject.toml is found (stops at "/").
if [[ -z "${ROOT_DIR}" ]]; then
  SEARCH_DIR="${SCRIPT_DIR}"
  while [[ "${SEARCH_DIR}" != "/" && ! -f "${SEARCH_DIR}/pyproject.toml" ]]; do
    SEARCH_DIR="$(dirname "${SEARCH_DIR}")"
  done
  ROOT_DIR="${SEARCH_DIR}"
fi

cd "${ROOT_DIR}" || die "cannot change directory to ROOT_DIR: ${ROOT_DIR}"

if [[ ! -f "pyproject.toml" ]]; then
  {
    echo "ERROR: pyproject.toml not found."
    echo "Current directory: $(pwd)"
    echo "Set ROOT_DIR manually or run this script from inside the repo."
  } >&2
  exit 1
fi

CONFIG="${CONFIG:-tests/configs/elasticsearch_5k_hnsw.yaml}"
OUT_DIR="${OUT_DIR:-/tmp/pr316_elasticsearch_hnsw_5k}"
EXPECTED_TOTAL_QUERIES="${EXPECTED_TOTAL_QUERIES:-200}"

ELASTICSEARCH_HOST="${ELASTICSEARCH_HOST:-http://localhost:9200}"
ELASTICSEARCH_API_KEY="${ELASTICSEARCH_API_KEY:-}"
ELASTICSEARCH_CLOUD_ID="${ELASTICSEARCH_CLOUD_ID:-}"

# OUT_DIR is removed recursively below; never allow that to be the filesystem root.
[[ "${OUT_DIR}" != "/" ]] || die "refusing to use '/' as OUT_DIR."

echo "Running Elasticsearch modular VDB smoke test"
echo "Config: ${CONFIG}"
echo "Output: ${OUT_DIR}"
echo "Elasticsearch host: ${ELASTICSEARCH_HOST}"

uv sync --extra vectordb-elasticsearch
uv pip install -e ./vdb_benchmark

rm -rf -- "${OUT_DIR:?}"

# Run the benchmark in a subshell so the exported connection settings do not
# leak into the rest of this script. Credentials travel via the environment,
# not argv, so they do not show up in process listings.
(
  export ELASTICSEARCH__HOST="${ELASTICSEARCH_HOST}"
  # API key and cloud id are forwarded together, and only when at least one of
  # them was supplied; otherwise only the host is set.
  if [[ -n "${ELASTICSEARCH_API_KEY}" || -n "${ELASTICSEARCH_CLOUD_ID}" ]]; then
    export ELASTICSEARCH__API_KEY="${ELASTICSEARCH_API_KEY}"
    export ELASTICSEARCH__CLOUD_ID="${ELASTICSEARCH_CLOUD_ID}"
  fi

  uv run python -m vdbbench.benchmark \
    --config "${CONFIG}" \
    --backend elasticsearch \
    --mode both \
    --force \
    --output-dir "${OUT_DIR}"
)

# The runner is expected to have written all four artifacts.
for artifact in query_vectors.npy ground_truth.npz search_results.json benchmark_meta.json; do
  [[ -f "${OUT_DIR}/${artifact}" ]] || die "missing output file: ${OUT_DIR}/${artifact}"
done

# Quoted heredoc delimiter: the Python source is passed through literally and
# reads its inputs from the environment, so unusual characters in OUT_DIR
# cannot break or inject into the Python code.
OUT_DIR="${OUT_DIR}" EXPECTED_TOTAL_QUERIES="${EXPECTED_TOTAL_QUERIES}" \
  uv run python - <<'PY'
import json
import os
from pathlib import Path

out_dir = Path(os.environ["OUT_DIR"])
expected_total = int(os.environ["EXPECTED_TOTAL_QUERIES"])

with open(out_dir / "search_results.json", "r", encoding="utf-8") as f:
    results = json.load(f)

assert results["total_queries"] == expected_total, results
assert results["qps"] > 0, results
assert 0 <= results["recall_at_k"] <= 1, results

print("Elasticsearch smoke test passed")
# Latency fields are informational only; .get() avoids a KeyError after the
# test has already been reported as passed.
print(json.dumps({
    "total_queries": results["total_queries"],
    "qps": results["qps"],
    "recall_at_k": results["recall_at_k"],
    "latency_p50_ms": results.get("latency_p50_ms"),
    "latency_p99_ms": results.get("latency_p99_ms"),
}, indent=2))
PY
78 changes: 78 additions & 0 deletions tests/run_milvus_10k_hnsw.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env bash
#
# Smoke test: run the modular VDB benchmark against Milvus with the 10k-vector
# HNSW config, then verify the expected output files and metrics.
#
# Environment overrides (all optional):
#   ROOT_DIR                Repository root containing pyproject.toml
#                           (auto-detected via git / upward search when unset).
#   CONFIG                  Benchmark config; relative paths resolve from the
#                           repository root.
#   OUT_DIR                 Output directory. It is DELETED before the run.
#   EXPECTED_TOTAL_QUERIES  Expected "total_queries" in search_results.json
#                           (default: 400). Adjust when overriding CONFIG.
#   MILVUS_HOST             Milvus host (default: 127.0.0.1).
#   MILVUS_PORT             Milvus port (default: 19530).
set -euo pipefail

die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

command -v uv >/dev/null 2>&1 || die "uv is required but was not found on PATH."

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# An explicit ROOT_DIR wins: the error message below tells users to set it, so
# it must not be silently overwritten by auto-detection.
ROOT_DIR="${ROOT_DIR:-}"

# Prefer the git repo root when available. Failure here (not a git checkout,
# git not installed) is expected and handled by the fallback, hence `|| true`.
if [[ -z "${ROOT_DIR}" ]]; then
  ROOT_DIR="$(git -C "${SCRIPT_DIR}" rev-parse --show-toplevel 2>/dev/null || true)"
fi

# Fallback: walk upward until pyproject.toml is found (stops at "/").
if [[ -z "${ROOT_DIR}" ]]; then
  SEARCH_DIR="${SCRIPT_DIR}"
  while [[ "${SEARCH_DIR}" != "/" && ! -f "${SEARCH_DIR}/pyproject.toml" ]]; do
    SEARCH_DIR="$(dirname "${SEARCH_DIR}")"
  done
  ROOT_DIR="${SEARCH_DIR}"
fi

cd "${ROOT_DIR}" || die "cannot change directory to ROOT_DIR: ${ROOT_DIR}"

if [[ ! -f "pyproject.toml" ]]; then
  {
    echo "ERROR: pyproject.toml not found."
    echo "Current directory: $(pwd)"
    echo "Set ROOT_DIR manually or run this script from inside the repo."
  } >&2
  exit 1
fi

CONFIG="${CONFIG:-tests/configs/milvus_10k_hnsw.yaml}"
OUT_DIR="${OUT_DIR:-/tmp/pr316_milvus_hnsw_10k}"
EXPECTED_TOTAL_QUERIES="${EXPECTED_TOTAL_QUERIES:-400}"

MILVUS_HOST="${MILVUS_HOST:-127.0.0.1}"
MILVUS_PORT="${MILVUS_PORT:-19530}"

# OUT_DIR is removed recursively below; never allow that to be the filesystem root.
[[ "${OUT_DIR}" != "/" ]] || die "refusing to use '/' as OUT_DIR."

echo "Running Milvus modular VDB smoke test"
echo "Repo root: $(pwd)"
echo "Config: ${CONFIG}"
echo "Output: ${OUT_DIR}"
echo "Milvus: ${MILVUS_HOST}:${MILVUS_PORT}"

uv sync --extra vectordb-milvus
uv pip install -e ./vdb_benchmark

rm -rf -- "${OUT_DIR:?}"

# Connection settings are handed to the runner through MILVUS__* variables
# scoped to this single command.
MILVUS__HOST="${MILVUS_HOST}" \
MILVUS__PORT="${MILVUS_PORT}" \
  uv run python -m vdbbench.benchmark \
    --config "${CONFIG}" \
    --backend milvus \
    --mode both \
    --force \
    --output-dir "${OUT_DIR}"

# The runner is expected to have written all four artifacts.
for artifact in query_vectors.npy ground_truth.npz search_results.json benchmark_meta.json; do
  [[ -f "${OUT_DIR}/${artifact}" ]] || die "missing output file: ${OUT_DIR}/${artifact}"
done

# Quoted heredoc delimiter: the Python source is passed through literally and
# reads its inputs from the environment, so unusual characters in OUT_DIR
# cannot break or inject into the Python code.
OUT_DIR="${OUT_DIR}" EXPECTED_TOTAL_QUERIES="${EXPECTED_TOTAL_QUERIES}" \
  uv run python - <<'PY'
import json
import os
from pathlib import Path

out_dir = Path(os.environ["OUT_DIR"])
expected_total = int(os.environ["EXPECTED_TOTAL_QUERIES"])

with open(out_dir / "search_results.json", "r", encoding="utf-8") as f:
    results = json.load(f)

assert results["total_queries"] == expected_total, results
assert results["qps"] > 0, results
assert 0 <= results["recall_at_k"] <= 1, results

print("Milvus smoke test passed")
# Latency fields are informational only, so missing keys print as null.
print(json.dumps({
    "total_queries": results["total_queries"],
    "qps": results["qps"],
    "recall_at_k": results["recall_at_k"],
    "latency_p50_ms": results.get("latency_p50_ms"),
    "latency_p99_ms": results.get("latency_p99_ms"),
}, indent=2))
PY
Loading
Loading