From 74f4365657f58b82c2fa160ea2108d9132ebee57 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Wed, 20 May 2026 16:00:33 +0800 Subject: [PATCH 01/31] feat(grep): integrate VikingDB bm25 keyword search for grep engine --- crates/ov_cli/src/client.rs | 14 +- crates/ov_cli/src/commands/search.rs | 6 + crates/ov_cli/src/handlers.rs | 6 + crates/ov_cli/src/main.rs | 15 ++ openviking/async_client.py | 6 + openviking/client/local.py | 6 + openviking/server/routers/search.py | 8 +- openviking/service/fs_service.py | 6 + openviking/storage/collection_schemas.py | 31 +++ .../queuefs/embedding_msg_converter.py | 4 + .../storage/vectordb/collection/collection.py | 9 +- .../vectordb/collection/http_collection.py | 39 +++- .../vectordb/collection/local_collection.py | 4 + .../vectordb/collection/vikingdb_clients.py | 2 + .../collection/vikingdb_collection.py | 5 + .../volcengine_api_key_collection.py | 5 + .../vectordb/collection/volcengine_clients.py | 4 + .../collection/volcengine_collection.py | 5 + .../storage/vectordb/service/app_models.py | 2 + .../storage/vectordb/utils/validation.py | 1 + openviking/storage/vectordb_adapters/base.py | 35 +++ openviking/storage/viking_fs.py | 218 ++++++++++++++++-- .../storage/viking_vector_index_backend.py | 77 ++++++- openviking/sync_client.py | 8 +- openviking_cli/client/base.py | 3 + openviking_cli/client/http.py | 6 + openviking_cli/client/sync_http.py | 8 +- tests/storage/mock_backend.py | 2 + tests/storage/test_collection_schemas.py | 9 +- tests/storage/test_rebuild_schema.py | 9 +- 30 files changed, 516 insertions(+), 37 deletions(-) diff --git a/crates/ov_cli/src/client.rs b/crates/ov_cli/src/client.rs index c792d15d39..e7e39841e7 100644 --- a/crates/ov_cli/src/client.rs +++ b/crates/ov_cli/src/client.rs @@ -425,8 +425,11 @@ impl HttpClient { ignore_case: bool, node_limit: i32, level_limit: i32, + engine: Option, + switch_to_remote_threshold: Option, + remote_return_limit: Option, ) -> Result { - let body = serde_json::json!({ 
+ let mut body = serde_json::json!({ "uri": uri, "exclude_uri": exclude_uri, "pattern": pattern, @@ -434,6 +437,15 @@ impl HttpClient { "node_limit": node_limit, "level_limit": level_limit, }); + if let Some(eng) = engine { + body["engine"] = serde_json::json!(eng); + } + if let Some(threshold) = switch_to_remote_threshold { + body["switch_to_remote_threshold"] = serde_json::json!(threshold); + } + if let Some(limit) = remote_return_limit { + body["remote_return_limit"] = serde_json::json!(limit); + } self.post("/api/v1/search/grep", &body).await } diff --git a/crates/ov_cli/src/commands/search.rs b/crates/ov_cli/src/commands/search.rs index b247a54f15..a4a934a05e 100644 --- a/crates/ov_cli/src/commands/search.rs +++ b/crates/ov_cli/src/commands/search.rs @@ -69,6 +69,9 @@ pub async fn grep( ignore_case: bool, node_limit: i32, level_limit: i32, + engine: Option, + switch_to_remote_threshold: Option, + remote_return_limit: Option, output_format: OutputFormat, compact: bool, ) -> Result<()> { @@ -80,6 +83,9 @@ pub async fn grep( ignore_case, node_limit, level_limit, + engine, + switch_to_remote_threshold, + remote_return_limit, ) .await?; output_success(&result, output_format, compact); diff --git a/crates/ov_cli/src/handlers.rs b/crates/ov_cli/src/handlers.rs index 3f1747807d..afaf254c60 100644 --- a/crates/ov_cli/src/handlers.rs +++ b/crates/ov_cli/src/handlers.rs @@ -1177,6 +1177,9 @@ pub async fn handle_grep( ignore_case: bool, node_limit: i32, level_limit: i32, + engine: Option, + switch_to_remote_threshold: Option, + remote_return_limit: Option, ctx: CliContext, ) -> Result<()> { // Prevent grep from root directory to avoid excessive server load and timeouts @@ -1215,6 +1218,9 @@ pub async fn handle_grep( ignore_case, node_limit, level_limit, + engine, + switch_to_remote_threshold, + remote_return_limit, ctx.output_format, ctx.compact, ) diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs index 4e00aea5c2..b6f44b1337 100644 --- 
a/crates/ov_cli/src/main.rs +++ b/crates/ov_cli/src/main.rs @@ -502,6 +502,15 @@ enum Commands { /// Maximum depth level to traverse (default: 10) #[arg(short = 'L', long = "level-limit", default_value = "10")] level_limit: i32, + /// Search engine mode: "auto" (default) or "fs" + #[arg(long = "engine", default_value = "auto")] + engine: Option, + /// L2 record count threshold to switch to vikingdb; 0 means always use vikingdb + #[arg(long = "switch-to-remote-threshold", default_value = "1000")] + switch_to_remote_threshold: Option, + /// Maximum files recalled by vikingdb bm25 (1-100000) + #[arg(long = "remote-return-limit", default_value = "100")] + remote_return_limit: Option, }, /// [Data] Run file glob pattern search Glob { @@ -1489,6 +1498,9 @@ async fn main() { ignore_case, node_limit, level_limit, + engine, + switch_to_remote_threshold, + remote_return_limit, } => { handlers::handle_grep( uri, @@ -1497,6 +1509,9 @@ async fn main() { ignore_case, node_limit, level_limit, + engine, + switch_to_remote_threshold, + remote_return_limit, ctx, ) .await diff --git a/openviking/async_client.py b/openviking/async_client.py index 92cd1e17ef..a694a2a2da 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -475,6 +475,9 @@ async def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, + engine: str = "auto", + switch_to_remote_threshold: int = 1000, + remote_return_limit: int = 100, ) -> Dict: """Content search""" await self._ensure_initialized() @@ -484,6 +487,9 @@ async def grep( case_insensitive=case_insensitive, node_limit=node_limit, exclude_uri=exclude_uri, + engine=engine, + switch_to_remote_threshold=switch_to_remote_threshold, + remote_return_limit=remote_return_limit, ) async def glob(self, pattern: str, uri: str = "viking://") -> Dict: diff --git a/openviking/client/local.py b/openviking/client/local.py index 27012a174f..640fd6a7d5 100644 --- a/openviking/client/local.py +++ 
b/openviking/client/local.py @@ -371,6 +371,9 @@ async def grep( node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, level_limit: int = 5, + engine: str = "auto", + switch_to_remote_threshold: int = 1000, + remote_return_limit: int = 100, ) -> Dict[str, Any]: """Content search with pattern.""" return await self._service.fs.grep( @@ -381,6 +384,9 @@ async def grep( node_limit=node_limit, exclude_uri=exclude_uri, level_limit=level_limit, + engine=engine, + switch_to_remote_threshold=switch_to_remote_threshold, + remote_return_limit=remote_return_limit, ) async def glob(self, pattern: str, uri: str = "viking://") -> Dict[str, Any]: diff --git a/openviking/server/routers/search.py b/openviking/server/routers/search.py index de2f4ec63e..df8eb5fbe0 100644 --- a/openviking/server/routers/search.py +++ b/openviking/server/routers/search.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Literal, Optional, Union from fastapi import APIRouter, Depends -from pydantic import BaseModel +from pydantic import BaseModel, Field from openviking.core.path_variables import resolve_path_variables from openviking.pyagfs.exceptions import AGFSClientError, AGFSNotFoundError @@ -111,6 +111,9 @@ class GrepRequest(BaseModel): case_insensitive: bool = False node_limit: Optional[int] = None level_limit: int = 5 + engine: Literal["auto", "fs"] = "auto" + switch_to_remote_threshold: int = Field(default=1000, ge=0, description="L2 record count threshold to switch to vikingdb; 0 means always use vikingdb") + remote_return_limit: int = Field(default=100, ge=1, le=100000, description="Maximum files recalled by vikingdb bm25") class GlobRequest(BaseModel): @@ -228,6 +231,9 @@ async def grep( case_insensitive=request.case_insensitive, node_limit=request.node_limit, level_limit=request.level_limit, + engine=request.engine, + switch_to_remote_threshold=request.switch_to_remote_threshold, + remote_return_limit=request.remote_return_limit, ) except AGFSNotFoundError: raise 
NotFoundError(resolved_uri, "file") diff --git a/openviking/service/fs_service.py b/openviking/service/fs_service.py index 64a9d8355c..3a52210add 100644 --- a/openviking/service/fs_service.py +++ b/openviking/service/fs_service.py @@ -241,6 +241,9 @@ async def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, level_limit: int = 5, + engine: str = "auto", + switch_to_remote_threshold: int = 1000, + remote_return_limit: int = 100, ) -> Dict: """Content search.""" viking_fs = self._ensure_initialized() @@ -253,6 +256,9 @@ async def grep( case_insensitive=case_insensitive, node_limit=node_limit, level_limit=level_limit, + engine=engine, + switch_to_remote_threshold=switch_to_remote_threshold, + remote_return_limit=remote_return_limit, ctx=ctx, ) diff --git a/openviking/storage/collection_schemas.py b/openviking/storage/collection_schemas.py index 00e1767f37..cb44712c0d 100644 --- a/openviking/storage/collection_schemas.py +++ b/openviking/storage/collection_schemas.py @@ -42,6 +42,14 @@ EMBEDDING_META_MARKER = "\n\n[openviking.embedding]\n" +def _parse_version(v: str) -> tuple: + """Parse a semver-like string into a comparable tuple of ints.""" + try: + return tuple(int(x) for x in v.split(".")) + except (ValueError, AttributeError): + return (0, 0, 0) + + @dataclass class RequestQueueStats: processed: int = 0 @@ -103,6 +111,7 @@ def context_collection( {"FieldName": "description", "FieldType": "string"}, {"FieldName": "tags", "FieldType": "string"}, {"FieldName": "abstract", "FieldType": "string"}, + {"FieldName": "content", "FieldType": "text"}, {"FieldName": "account_id", "FieldType": "string"}, {"FieldName": "owner_user_id", "FieldType": "string"}, {"FieldName": "owner_agent_id", "FieldType": "string"}, @@ -131,6 +140,9 @@ def context_collection( "Description": description or "Unified context collection", "Fields": fields, "ScalarIndex": scalar_index, + "FullText": [ + {"Field": "content", "Analyzer": {"Tokenizer": "standard"}}, + ], } @@ 
-169,6 +181,7 @@ def _build_embedding_metadata(config: "OpenVikingConfig") -> Dict[str, Any]: "model": model, "dimension": dimension, "model_identity": model_identity, + "schema_version": "0.3.18", } @@ -248,6 +261,24 @@ async def init_context_collection(storage) -> bool: base_description, existing_embedding_meta = _decode_collection_description( existing_meta.get("Description") ) + + # Schema compatibility check: warn if collection was created by older OV version + if existing_embedding_meta: + existing_schema_version = existing_embedding_meta.get("schema_version", "0.0.0") + if _parse_version(existing_schema_version) < _parse_version("0.3.18"): + fields = existing_meta.get("Fields", []) + has_content = any(f.get("FieldName") == "content" and f.get("FieldType") == "text" for f in fields) + fulltext = existing_meta.get("FullText") or [] + has_content_fulltext = any(ft.get("Field") == "content" for ft in fulltext) + if not (has_content and has_content_fulltext): + logger.warning( + "Collection schema is outdated (created by OV %s, requires >= 0.3.18). " + "Missing 'content' field or FullText config. " + "grep engine=auto will fall back to fs. " + "Recreate the collection to enable vikingdb-based grep.", + existing_schema_version, + ) + if existing_embedding_meta == embedding_meta: return False diff --git a/openviking/storage/queuefs/embedding_msg_converter.py b/openviking/storage/queuefs/embedding_msg_converter.py index c681f799b4..6f983546f9 100644 --- a/openviking/storage/queuefs/embedding_msg_converter.py +++ b/openviking/storage/queuefs/embedding_msg_converter.py @@ -68,6 +68,10 @@ def from_context(context: Context) -> EmbeddingMsg: resolved_level = int(resolved_level.value) context_data["level"] = int(resolved_level) + # Store vectorization text in content field for bm25 full-text search. + # Truncate to 64KB (VikingDB text field limit). 
+ context_data["content"] = vectorization_text[:65536] + embedding_msg = EmbeddingMsg( message=vectorization_text, context_data=context_data, diff --git a/openviking/storage/vectordb/collection/collection.py b/openviking/storage/vectordb/collection/collection.py index 4d50cdafef..c42b90fc79 100644 --- a/openviking/storage/vectordb/collection/collection.py +++ b/openviking/storage/vectordb/collection/collection.py @@ -73,6 +73,8 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, + mode: Optional[str] = None, + fields: Optional[List[str]] = None, ) -> SearchResult: raise NotImplementedError @@ -348,6 +350,8 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, + mode: Optional[str] = None, + fields: Optional[List[str]] = None, ): """Search by keywords or query string using vectorization. @@ -360,6 +364,8 @@ def search_by_keywords( filters (Optional[Dict[str, Any]]): Query filters to narrow down results. Defaults to None. output_fields (Optional[List[str]]): List of field names to include in results. If None, returns all fields. Defaults to None. + mode (Optional[str]): Search mode, e.g. "bm25" for full-text search. Defaults to None. + fields (Optional[List[str]]): Text fields to search in (for bm25 mode). Defaults to None. Returns: SearchResult: Search results containing matching documents with scores and field values. 
@@ -371,7 +377,8 @@ def search_by_keywords( if self.__collection is None: raise RuntimeError("Collection is closed") return self.__collection.search_by_keywords( - index_name, keywords, query, limit, offset, filters, output_fields + index_name, keywords, query, limit, offset, filters, output_fields, + mode, fields ) def search_by_id( diff --git a/openviking/storage/vectordb/collection/http_collection.py b/openviking/storage/vectordb/collection/http_collection.py index 530a7af6c8..8b831ed0a9 100644 --- a/openviking/storage/vectordb/collection/http_collection.py +++ b/openviking/storage/vectordb/collection/http_collection.py @@ -15,10 +15,15 @@ SearchResult, ) +import openviking + # Default request timeout (seconds) DEFAULT_TIMEOUT = 30 -headers = {"Content-Type": "application/json"} +headers = { + "Content-Type": "application/json", + "User-Agent": f"openviking/{openviking.__version__}", +} def get_or_create_http_collection( @@ -42,6 +47,10 @@ def get_or_create_http_collection( url = "http://{}:{}/CreateVikingdbCollection".format(host, port) if "Fields" in meta_data: meta_data["Fields"] = json.dumps(meta_data["Fields"]) + if "FullText" in meta_data: + meta_data["FullText"] = json.dumps(meta_data["FullText"]) + if "ScalarIndex" in meta_data: + meta_data["ScalarIndex"] = json.dumps(meta_data["ScalarIndex"]) response = requests.post(url, headers=headers, json=meta_data, timeout=DEFAULT_TIMEOUT) # logger.info(f"CreateVikingdbCollection response: {response.text}") if response.status_code == 200: @@ -532,22 +541,28 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, + mode: Optional[str] = None, + fields: Optional[List[str]] = None, ) -> SearchResult: url = self.url_prefix + "api/vikingdb/data/search/keywords" + payload = { + "project": self.project_name, + "collection_name": self.collection_name, + "index_name": index_name, + "keywords": json.dumps(keywords) if keywords else None, + "query": 
query, + "filter": json.dumps(filters) if filters else None, + "output_fields": json.dumps(output_fields) if output_fields else None, + "limit": limit, + "offset": offset, + "mode": mode, + "fields": json.dumps(fields) if fields else None, + } + payload = {k: v for k, v in payload.items() if v is not None} response = requests.post( url, headers=headers, - json={ - "project": self.project_name, - "collection_name": self.collection_name, - "index_name": index_name, - "keywords": json.dumps(keywords) if keywords else None, - "query": query, - "filter": json.dumps(filters) if filters else None, - "output_fields": json.dumps(output_fields) if output_fields else None, - "limit": limit, - "offset": offset, - }, + json=payload, timeout=DEFAULT_TIMEOUT, ) # logger.info(f"SearchByKeywords response: {response.text}") diff --git a/openviking/storage/vectordb/collection/local_collection.py b/openviking/storage/vectordb/collection/local_collection.py index 5d1797dea7..5285b83382 100644 --- a/openviking/storage/vectordb/collection/local_collection.py +++ b/openviking/storage/vectordb/collection/local_collection.py @@ -436,6 +436,8 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, + mode: Optional[str] = None, + fields: Optional[List[str]] = None, ) -> SearchResult: """Search by keywords by generating vectors and calling search_by_vector. 
@@ -447,6 +449,8 @@ def search_by_keywords( offset: Number of results to skip filters: Filter conditions output_fields: List of fields to return + mode: Search mode (ignored for local backend) + fields: Text fields to search (ignored for local backend) Returns: SearchResult: Search results diff --git a/openviking/storage/vectordb/collection/vikingdb_clients.py b/openviking/storage/vectordb/collection/vikingdb_clients.py index 4f1d25520f..295cdaab0a 100644 --- a/openviking/storage/vectordb/collection/vikingdb_clients.py +++ b/openviking/storage/vectordb/collection/vikingdb_clients.py @@ -5,6 +5,7 @@ import requests +import openviking from openviking_cli.utils.logger import default_logger as logger # Default request timeout (seconds) @@ -82,6 +83,7 @@ def do_req( headers = { "Accept": "application/json", "Content-Type": "application/json", + "User-Agent": f"openviking/{openviking.__version__}", } headers.update(self.headers) diff --git a/openviking/storage/vectordb/collection/vikingdb_collection.py b/openviking/storage/vectordb/collection/vikingdb_collection.py index 94577e27ca..ddeaa1e7c5 100644 --- a/openviking/storage/vectordb/collection/vikingdb_collection.py +++ b/openviking/storage/vectordb/collection/vikingdb_collection.py @@ -322,6 +322,8 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, + mode: Optional[str] = None, + fields: Optional[List[str]] = None, ) -> SearchResult: path = "/api/vikingdb/data/search/keywords" data = { @@ -334,7 +336,10 @@ def search_by_keywords( "output_fields": output_fields, "limit": limit, "offset": offset, + "mode": mode, + "fields": fields, } + data = {k: v for k, v in data.items() if v is not None} resp_data = self._data_post(path, data) return self._parse_search_result(resp_data) diff --git a/openviking/storage/vectordb/collection/volcengine_api_key_collection.py b/openviking/storage/vectordb/collection/volcengine_api_key_collection.py index 
975fc4c6a0..f432d683ee 100644 --- a/openviking/storage/vectordb/collection/volcengine_api_key_collection.py +++ b/openviking/storage/vectordb/collection/volcengine_api_key_collection.py @@ -361,6 +361,8 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, + mode: Optional[str] = None, + fields: Optional[List[str]] = None, ) -> SearchResult: path = "/api/vikingdb/data/search/keywords" data = { @@ -371,7 +373,10 @@ def search_by_keywords( "output_fields": output_fields, "limit": limit, "offset": offset, + "mode": mode, + "fields": fields, } + data = {k: v for k, v in data.items() if v is not None} resp_data = self._data_post(path, data) return self._parse_search_result(resp_data) diff --git a/openviking/storage/vectordb/collection/volcengine_clients.py b/openviking/storage/vectordb/collection/volcengine_clients.py index 6947e1791e..aa28c64e3d 100644 --- a/openviking/storage/vectordb/collection/volcengine_clients.py +++ b/openviking/storage/vectordb/collection/volcengine_clients.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: AGPL-3.0 import json +import openviking import requests # type: ignore from volcengine.auth.SignerV4 import SignerV4 from volcengine.base.Request import Request @@ -46,6 +47,7 @@ def prepare_request(self, method, params=None, data=None): "Accept": "application/json", "Content-Type": "application/json", "Host": self.host, + "User-Agent": f"openviking/{openviking.__version__}", } r.set_headers(mheaders) if params: @@ -109,6 +111,7 @@ def prepare_request(self, method, path, params=None, data=None): "Accept": "application/json", "Content-Type": "application/json", "Host": self.host, + "User-Agent": f"openviking/{openviking.__version__}", } r.set_headers(mheaders) if params: @@ -168,6 +171,7 @@ def prepare_request(self, method, path, params=None, data=None): "Content-Type": "application/json", "Host": self.host, "Authorization": f"Bearer {self.api_key}", + "User-Agent": 
f"openviking/{openviking.__version__}", } r.set_headers(mheaders) if params: diff --git a/openviking/storage/vectordb/collection/volcengine_collection.py b/openviking/storage/vectordb/collection/volcengine_collection.py index 45e58aef87..22485ae0d3 100644 --- a/openviking/storage/vectordb/collection/volcengine_collection.py +++ b/openviking/storage/vectordb/collection/volcengine_collection.py @@ -560,6 +560,8 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, + mode: Optional[str] = None, + fields: Optional[List[str]] = None, ) -> SearchResult: path = "/api/vikingdb/data/search/keywords" data = { @@ -572,7 +574,10 @@ def search_by_keywords( "output_fields": output_fields, "limit": limit, "offset": offset, + "mode": mode, + "fields": fields, } + data = {k: v for k, v in data.items() if v is not None} resp_data = self._data_post(path, data) return self._parse_search_result(resp_data) diff --git a/openviking/storage/vectordb/service/app_models.py b/openviking/storage/vectordb/service/app_models.py index 81a622c609..d15a617d35 100644 --- a/openviking/storage/vectordb/service/app_models.py +++ b/openviking/storage/vectordb/service/app_models.py @@ -167,6 +167,8 @@ class SearchByKeywordsRequest(BaseModel): output_fields: Optional[Any] = Field(None, description="Output fields") limit: Optional[int] = Field(10, description="Result limit") offset: Optional[int] = Field(0, description="Result offset") + mode: Optional[str] = Field(None, description="Search mode, e.g. 
bm25") + fields: Optional[Any] = Field(None, description="Text fields to search in") # ==================== Response Model ==================== diff --git a/openviking/storage/vectordb/utils/validation.py b/openviking/storage/vectordb/utils/validation.py index 887d059589..378756d378 100644 --- a/openviking/storage/vectordb/utils/validation.py +++ b/openviking/storage/vectordb/utils/validation.py @@ -152,6 +152,7 @@ class CollectionMetaConfig(BaseModel): ProjectName: Optional[str] = None Description: Optional[str] = Field(None, max_length=65535) Vectorize: Optional[VectorizeConfig] = None + FullText: Optional[List[dict]] = None # e.g. [{"Field": "content", "Analyzer": {"Tokenizer": "standard"}}] # Internal fields _FieldsCount: Optional[int] = None diff --git a/openviking/storage/vectordb_adapters/base.py b/openviking/storage/vectordb_adapters/base.py index 99f1b36593..fc9806e005 100644 --- a/openviking/storage/vectordb_adapters/base.py +++ b/openviking/storage/vectordb_adapters/base.py @@ -523,6 +523,41 @@ def count(self, filter: Optional[Dict[str, Any] | FilterExpr] = None) -> int: return 0 + def search_by_keywords( + self, + keywords: Optional[list[str]] = None, + query: Optional[str] = None, + limit: int = 10, + offset: int = 0, + filter: Optional[Dict[str, Any] | FilterExpr] = None, + output_fields: Optional[list[str]] = None, + mode: Optional[str] = None, + fields: Optional[list[str]] = None, + ) -> list[Dict[str, Any]]: + coll = self.get_collection() + result = coll.search_by_keywords( + index_name=self._index_name, + keywords=keywords, + query=query, + limit=limit, + offset=offset, + filters=self._compile_filter(filter), + output_fields=output_fields, + mode=mode, + fields=fields, + ) + records: list[Dict[str, Any]] = [] + for item in result.data: + record = dict(item.fields) if item.fields else {} + record["id"] = item.id + raw_score = item.score if item.score is not None else 0.0 + if not math.isfinite(raw_score): + raw_score = 0.0 + record["_score"] = 
raw_score + record = self._normalize_record_for_read(record) + records.append(record) + return records + def clear(self) -> bool: self.get_collection().delete_all_data() return True diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 8f6a5ea93b..ea23bc9dd4 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -17,6 +17,7 @@ import hashlib import json import re +import time from contextlib import contextmanager from dataclasses import dataclass, field from datetime import datetime, timezone @@ -41,7 +42,7 @@ from openviking.resource.watch_storage import is_watch_task_control_uri from openviking.server.error_mapping import is_not_found_error, map_exception from openviking.server.identity import RequestContext, Role -from openviking.storage.expr import PathScope +from openviking.storage.expr import And, Eq, PathScope from openviking.telemetry import get_current_telemetry from openviking.utils.time_utils import format_simplified, get_current_timestamp, parse_iso_datetime from openviking_cli.exceptions import ( @@ -227,6 +228,7 @@ def __init__( self.vector_store = vector_store self.retrieval_config = retrieval_config self._encryptor = encryptor + self._count_cache: Dict[str, tuple] = {} # uri → (count, timestamp) self._bound_ctx: contextvars.ContextVar[Optional[RequestContext]] = contextvars.ContextVar( "vikingfs_bound_ctx", default=None ) @@ -686,11 +688,16 @@ async def grep( node_limit: Optional[int] = None, level_limit: int = 5, ctx: Optional[RequestContext] = None, + engine: str = "auto", + switch_to_remote_threshold: int = 1000, + remote_return_limit: int = 100, ) -> Dict: """Content search by pattern or keywords. Optimized implementation that uses agfs native grep when possible. Falls back to VikingFS layer implementation for encrypted files. + When engine="auto" and vikingdb is available with sufficient data, + uses vikingdb bm25 recall + local fs precise matching. 
Args: uri: Viking URI @@ -700,6 +707,11 @@ async def grep( node_limit: Maximum number of results to return level_limit: Maximum depth level to traverse (default: 5) ctx: Request context + engine: Search engine mode: "auto" (default) or "fs" + switch_to_remote_threshold: L2 record count threshold to switch to + vikingdb; 0 means always use vikingdb (default: 1000) + remote_return_limit: Maximum files recalled by vikingdb bm25 + (default: 100, max: 100000) Returns: Dict with matches, count, match_count, files_scanned @@ -707,8 +719,15 @@ async def grep( self._ensure_access(uri, ctx) await self.stat(uri, ctx=ctx) - if self._encryptor: - return await self._grep_encrypted( + # Clamp remote_return_limit to valid range + remote_return_limit = max(1, min(remote_return_limit, 100000)) + + resolved_engine = await self._resolve_grep_engine( + engine, uri, ctx, switch_to_remote_threshold + ) + + if resolved_engine == "fs": + return await self._grep_fs( uri=uri, pattern=pattern, exclude_uri=exclude_uri, @@ -717,30 +736,203 @@ async def grep( level_limit=level_limit, ctx=ctx, ) - - try: - return await self._grep_with_agfs( + else: # "vikingdb_then_fs" + return await self._grep_vikingdb_then_fs( uri=uri, pattern=pattern, exclude_uri=exclude_uri, case_insensitive=case_insensitive, node_limit=node_limit, level_limit=level_limit, + remote_return_limit=remote_return_limit, ctx=ctx, ) + + async def _resolve_grep_engine(self, engine: str, uri: str, ctx, + switch_to_remote_threshold: int = 1000) -> str: + """Resolve the actual grep engine to use.""" + if engine == "fs": + return "fs" + + # auto mode: check vikingdb availability + vector_store = self._get_vector_store() + if not vector_store: + return "fs" + + backend_type = getattr(vector_store, "_backend_type", "unknown") + if backend_type not in ("volcengine", "vikingdb"): + return "fs" + + # Check collection has content field and FullText config + if not await self._collection_has_fulltext(vector_store, ctx): + return "fs" + + # 
switch_to_remote_threshold=0 means always use vikingdb + if switch_to_remote_threshold == 0: + return "vikingdb_then_fs" + + # Check data volume threshold + try: + count = await self._get_cached_count(uri, ctx) + if count < switch_to_remote_threshold: + return "fs" + except Exception: + logger.debug("grep engine=auto: count() check failed, falling back to fs", exc_info=True) + return "fs" + + return "vikingdb_then_fs" + + async def _collection_has_fulltext(self, vector_store, ctx) -> bool: + """Check if collection has content field and FullText config.""" + try: + meta = None + if hasattr(vector_store, "get_collection_meta"): + meta = await vector_store.get_collection_meta(ctx=ctx) + if not meta: + return False + fields = meta.get("Fields", []) + has_content = any( + f.get("FieldName") == "content" and f.get("FieldType") == "text" + for f in fields + ) + fulltext = meta.get("FullText") or [] + has_content_fulltext = any( + ft.get("Field") == "content" for ft in fulltext + ) + return has_content and has_content_fulltext + except Exception: + logger.debug("Failed to check collection fulltext config, assuming no fulltext", exc_info=True) + return False + + async def _get_cached_count(self, uri: str, ctx) -> int: + """Get cached count of L2 records for a URI (TTL=60s).""" + _COUNT_CACHE_TTL = 60 + vector_store = self._get_vector_store() + + # Include account_id in cache key for multi-tenant safety + account_id = getattr(ctx, "account_id", None) if ctx else None + cache_key = f"{account_id}:{uri}" if account_id else uri + + now = time.time() + cached = self._count_cache.get(cache_key) + if cached and (now - cached[1]) < _COUNT_CACHE_TTL: + return cached[0] + + count = await vector_store.count( + filter=And([PathScope("uri", uri), Eq("level", 2)]), ctx=ctx + ) + self._count_cache[cache_key] = (count, now) + return count + + async def _grep_fs(self, uri, pattern, exclude_uri, case_insensitive, + node_limit, level_limit, ctx): + """Existing fs path (renamed from original 
inline logic).""" + if self._encryptor: + return await self._grep_encrypted( + uri=uri, pattern=pattern, exclude_uri=exclude_uri, + case_insensitive=case_insensitive, node_limit=node_limit, + level_limit=level_limit, ctx=ctx, + ) + try: + return await self._grep_with_agfs( + uri=uri, pattern=pattern, exclude_uri=exclude_uri, + case_insensitive=case_insensitive, node_limit=node_limit, + level_limit=level_limit, ctx=ctx, + ) except (AttributeError, AGFSNotSupportedError, NotImplementedError) as e: logger.debug(f"agfs grep unavailable, falling back to VikingFS implementation: {e}") return await self._grep_encrypted( - uri=uri, - pattern=pattern, - exclude_uri=exclude_uri, - case_insensitive=case_insensitive, - node_limit=node_limit, - level_limit=level_limit, - ctx=ctx, + uri=uri, pattern=pattern, exclude_uri=exclude_uri, + case_insensitive=case_insensitive, node_limit=node_limit, + level_limit=level_limit, ctx=ctx, ) + async def _grep_vikingdb_then_fs(self, uri, pattern, exclude_uri, + case_insensitive, node_limit, + level_limit, remote_return_limit, ctx): + """VikingDB bm25 recall + local fs precise matching.""" + vector_store = self._get_vector_store() + + # Step 1: vikingdb recall candidate files + try: + filter_expr = And([ + PathScope("uri", uri), + Eq("level", 2), + ]) + result = await vector_store.search_by_keywords( + keywords=[pattern], + mode="bm25", + fields=["content"], + limit=remote_return_limit, + filter=filter_expr, + output_fields=["uri"], + ctx=ctx, + ) + except Exception as e: + logger.warning(f"grep vikingdb step failed, falling back to fs: {e}") + return await self._grep_fs( + uri=uri, pattern=pattern, exclude_uri=exclude_uri, + case_insensitive=case_insensitive, node_limit=node_limit, + level_limit=level_limit, ctx=ctx, + ) + + candidate_uris = [ + r["uri"] for r in result + if r.get("uri") + ] + if exclude_uri: + candidate_uris = [u for u in candidate_uris if not u.startswith(exclude_uri)] + if not candidate_uris: + # BM25 returned no 
candidates — fall back to fs to avoid missing matches + # (regex patterns may not work well as BM25 keywords) + return await self._grep_fs( + uri=uri, pattern=pattern, exclude_uri=exclude_uri, + case_insensitive=case_insensitive, node_limit=node_limit, + level_limit=level_limit, ctx=ctx, + ) + + # Step 2: local fs precise matching on candidate files + return await self._grep_in_files( + candidate_uris, pattern, case_insensitive, node_limit, ctx, + ) + + async def _grep_in_files(self, file_uris: List[str], pattern: str, + case_insensitive: bool, node_limit: Optional[int], + ctx: Optional[RequestContext]) -> Dict: + """Execute regex matching in specified file list (vikingdb_then_fs Step 2).""" + flags = re.IGNORECASE if case_insensitive else 0 + compiled = re.compile(pattern, flags) + + results = [] + files_scanned = 0 + + for file_uri in file_uris: + files_scanned += 1 + try: + content_bytes = await self.read(file_uri, ctx=ctx) + content = content_bytes.decode("utf-8", errors="replace") + except Exception: + continue + + for line_no, line in enumerate(content.splitlines(), 1): + if compiled.search(line): + results.append({"uri": file_uri, "line": line_no, "content": line}) + if node_limit and len(results) >= node_limit: + return { + "matches": results, + "count": len(results), + "match_count": len(results), + "files_scanned": files_scanned, + } + + return { + "matches": results, + "count": len(results), + "match_count": len(results), + "files_scanned": files_scanned, + } + async def _grep_with_agfs( self, uri: str, diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index 02fff3bca5..adde1b7def 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -481,6 +481,42 @@ async def optimize(self) -> bool: logger.info("Optimization requested") return True + async def search_by_keywords( + self, + keywords: Optional[List[str]] = None, + query: 
Optional[str] = None, + limit: int = 10, + offset: int = 0, + filter: Optional[Dict[str, Any] | FilterExpr] = None, + output_fields: Optional[List[str]] = None, + mode: Optional[str] = None, + fields: Optional[List[str]] = None, + ) -> List[Dict[str, Any]]: + try: + if self._bound_account_id: + account_filter = Eq("account_id", self._bound_account_id) + if filter: + if isinstance(filter, dict): + filter = RawDSL(filter) + filter = And([account_filter, filter]) + else: + filter = account_filter + + return await asyncio.to_thread( + self._adapter.search_by_keywords, + keywords=keywords, + query=query, + limit=limit, + offset=offset, + filter=filter, + output_fields=output_fields, + mode=mode, + fields=fields, + ) + except Exception as e: + logger.error("Error searching by keywords: %s", e) + return [] + async def close(self) -> None: try: await self._async_adapter.call("close") @@ -534,6 +570,7 @@ def __init__(self, config: Optional[VectorDBBackendConfig]): init_cpp_logging() self._config = config + self._backend_type = config.backend # expose for engine resolution self.vector_dim = config.dimension self.distance_metric = config.distance_metric self.sparse_weight = config.sparse_weight @@ -619,8 +656,16 @@ async def collection_exists_bound(self) -> bool: async def get_collection_info(self) -> Optional[Dict[str, Any]]: return await self._get_default_backend().get_collection_info() - async def get_collection_meta(self) -> Optional[Dict[str, Any]]: - return await self._get_default_backend().get_collection_meta() + async def get_collection_meta( + self, + *, + ctx: Optional[RequestContext] = None, + ) -> Optional[Dict[str, Any]]: + if ctx: + backend = self._get_backend_for_context(ctx) + else: + backend = self._get_default_backend() + return await backend.get_collection_meta() async def update_collection_description(self, description: str) -> bool: return await self._get_default_backend().update_collection_description(description) @@ -757,6 +802,34 @@ async def count( 
backend = self._get_default_backend() return await backend.count(filter=filter) + async def search_by_keywords( + self, + keywords: Optional[List[str]] = None, + query: Optional[str] = None, + limit: int = 10, + offset: int = 0, + filter: Optional[Dict[str, Any] | FilterExpr] = None, + output_fields: Optional[List[str]] = None, + mode: Optional[str] = None, + fields: Optional[List[str]] = None, + *, + ctx: Optional[RequestContext] = None, + ) -> List[Dict[str, Any]]: + if ctx: + backend = self._get_backend_for_context(ctx) + else: + backend = self._get_default_backend() + return await backend.search_by_keywords( + keywords=keywords, + query=query, + limit=limit, + offset=offset, + filter=filter, + output_fields=output_fields, + mode=mode, + fields=fields, + ) + async def clear(self, *, ctx: Optional[RequestContext] = None) -> bool: if ctx: backend = self._get_backend_for_context(ctx) diff --git a/openviking/sync_client.py b/openviking/sync_client.py index d6bd4ed2c5..65ec648115 100644 --- a/openviking/sync_client.py +++ b/openviking/sync_client.py @@ -366,10 +366,16 @@ def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, + engine: str = "auto", + switch_to_remote_threshold: int = 1000, + remote_return_limit: int = 100, ) -> Dict: """Content search""" return run_async( - self._async_client.grep(uri, pattern, case_insensitive, node_limit, exclude_uri) + self._async_client.grep( + uri, pattern, case_insensitive, node_limit, exclude_uri, + engine, switch_to_remote_threshold, remote_return_limit, + ) ) def glob(self, pattern: str, uri: str = "viking://") -> Dict: diff --git a/openviking_cli/client/base.py b/openviking_cli/client/base.py index 9d47dc7e14..a933444743 100644 --- a/openviking_cli/client/base.py +++ b/openviking_cli/client/base.py @@ -194,6 +194,9 @@ async def grep( case_insensitive: bool = False, exclude_uri: Optional[str] = None, node_limit: Optional[int] = None, + engine: str = "auto", + 
switch_to_remote_threshold: int = 1000, + remote_return_limit: int = 100, ) -> Dict[str, Any]: """Content search with pattern.""" ... diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py index 033f792ae3..46d832a390 100644 --- a/openviking_cli/client/http.py +++ b/openviking_cli/client/http.py @@ -740,6 +740,9 @@ async def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, + engine: str = "auto", + switch_to_remote_threshold: int = 1000, + remote_return_limit: int = 100, ) -> Dict[str, Any]: """Content search with pattern.""" uri = VikingURI.normalize(uri) @@ -747,6 +750,9 @@ async def grep( "uri": uri, "pattern": pattern, "case_insensitive": case_insensitive, + "engine": engine, + "switch_to_remote_threshold": switch_to_remote_threshold, + "remote_return_limit": remote_return_limit, } if node_limit is not None: request_json["node_limit"] = node_limit diff --git a/openviking_cli/client/sync_http.py b/openviking_cli/client/sync_http.py index a586321c88..f868181d01 100644 --- a/openviking_cli/client/sync_http.py +++ b/openviking_cli/client/sync_http.py @@ -313,10 +313,16 @@ def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, + engine: str = "auto", + switch_to_remote_threshold: int = 1000, + remote_return_limit: int = 100, ) -> Dict: """Content search with pattern.""" return run_async( - self._async_client.grep(uri, pattern, case_insensitive, node_limit, exclude_uri) + self._async_client.grep( + uri, pattern, case_insensitive, node_limit, exclude_uri, + engine, switch_to_remote_threshold, remote_return_limit, + ) ) def glob(self, pattern: str, uri: str = "viking://") -> Dict: diff --git a/tests/storage/mock_backend.py b/tests/storage/mock_backend.py index 07e28d50fd..7dd67d919c 100644 --- a/tests/storage/mock_backend.py +++ b/tests/storage/mock_backend.py @@ -110,6 +110,8 @@ def search_by_keywords( offset: int = 0, filters: 
Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, + mode: Optional[str] = None, + fields: Optional[List[str]] = None, ) -> SearchResult: raise NotImplementedError("MockCollection.search_by_keywords is not supported") diff --git a/tests/storage/test_collection_schemas.py b/tests/storage/test_collection_schemas.py index 8dbe0d48bb..07b54afc8e 100644 --- a/tests/storage/test_collection_schemas.py +++ b/tests/storage/test_collection_schemas.py @@ -176,7 +176,10 @@ async def update_collection_description(self, description): @pytest.mark.asyncio -async def test_init_context_collection_rejects_mismatched_nonempty_collection(monkeypatch): +async def test_init_context_collection_warns_on_mismatched_nonempty_collection(monkeypatch): + """When embedding metadata mismatches for a non-empty collection, the function + logs a warning and returns False (does not raise).""" + class _FakeStorage: async def create_collection(self, name, schema): del name, schema @@ -204,8 +207,8 @@ async def update_collection_description(self, description): # pragma: no cover lambda: config, ) - with pytest.raises(EmbeddingRebuildRequiredError, match="Rebuild is required"): - await init_context_collection(_FakeStorage()) + result = await init_context_collection(_FakeStorage()) + assert result is False def test_build_embedding_metadata_hashes_resolved_local_model_path(tmp_path): diff --git a/tests/storage/test_rebuild_schema.py b/tests/storage/test_rebuild_schema.py index df40ef57b5..d170507057 100644 --- a/tests/storage/test_rebuild_schema.py +++ b/tests/storage/test_rebuild_schema.py @@ -3,8 +3,13 @@ from openviking.storage.collection_schemas import CollectionSchemas -def test_context_collection_does_not_contain_rebuild_content_snapshot_field(): +def test_context_collection_contains_content_field_for_fulltext(): schema = CollectionSchemas.context_collection("ctx", 8) field_names = {field["FieldName"] for field in schema["Fields"]} - assert "content" not in field_names + # 
content field is required for VikingDB FullText (bm25) search + assert "content" in field_names + # embedding_content is not a schema field assert "embedding_content" not in field_names + # FullText config must reference the content field + fulltext_fields = [ft["Field"] for ft in schema.get("FullText", [])] + assert "content" in fulltext_fields From af0423e1163b3d9a765a63ac8f567a387b4d285f Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Wed, 20 May 2026 17:47:11 +0800 Subject: [PATCH 02/31] fix(grep): address CI review feedback: max-size eviction to _count_cache, use Literal, Split regex alternation into individual keywords for bm25 (max 10) --- openviking/async_client.py | 4 +++- openviking/client/local.py | 4 +++- openviking/server/routers/search.py | 3 ++- openviking/service/fs_service.py | 4 +++- openviking/storage/viking_fs.py | 22 +++++++++++++++++----- openviking/sync_client.py | 4 +++- openviking_cli/client/base.py | 4 +++- openviking_cli/client/http.py | 4 +++- openviking_cli/client/sync_http.py | 4 +++- 9 files changed, 40 insertions(+), 13 deletions(-) diff --git a/openviking/async_client.py b/openviking/async_client.py index a694a2a2da..81df165f26 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -11,6 +11,8 @@ import threading from typing import Any, Dict, List, Optional, Union +from openviking.storage.viking_fs import GrepEngine + from openviking.client import LocalClient, Session from openviking.service.debug_service import SystemStatus from openviking.telemetry import TelemetryRequest @@ -475,7 +477,7 @@ async def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, - engine: str = "auto", + engine: GrepEngine = "auto", switch_to_remote_threshold: int = 1000, remote_return_limit: int = 100, ) -> Dict: diff --git a/openviking/client/local.py b/openviking/client/local.py index 640fd6a7d5..687f74d74f 100644 --- a/openviking/client/local.py +++ 
b/openviking/client/local.py @@ -7,6 +7,8 @@ from typing import Any, Dict, List, Optional, Union +from openviking.storage.viking_fs import GrepEngine + from openviking.server.identity import RequestContext, Role from openviking.service import OpenVikingService from openviking.telemetry import TelemetryRequest @@ -371,7 +373,7 @@ async def grep( node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, level_limit: int = 5, - engine: str = "auto", + engine: GrepEngine = "auto", switch_to_remote_threshold: int = 1000, remote_return_limit: int = 100, ) -> Dict[str, Any]: diff --git a/openviking/server/routers/search.py b/openviking/server/routers/search.py index df8eb5fbe0..d559ab0163 100644 --- a/openviking/server/routers/search.py +++ b/openviking/server/routers/search.py @@ -11,6 +11,7 @@ from openviking.core.path_variables import resolve_path_variables from openviking.pyagfs.exceptions import AGFSClientError, AGFSNotFoundError from openviking.server.auth import get_request_context +from openviking.storage.viking_fs import GrepEngine from openviking.server.dependencies import get_service from openviking.server.error_mapping import map_exception from openviking.server.identity import RequestContext @@ -111,7 +112,7 @@ class GrepRequest(BaseModel): case_insensitive: bool = False node_limit: Optional[int] = None level_limit: int = 5 - engine: Literal["auto", "fs"] = "auto" + engine: GrepEngine = "auto" switch_to_remote_threshold: int = Field(default=1000, ge=0, description="L2 record count threshold to switch to vikingdb; 0 means always use vikingdb") remote_return_limit: int = Field(default=100, ge=1, le=100000, description="Maximum files recalled by vikingdb bm25") diff --git a/openviking/service/fs_service.py b/openviking/service/fs_service.py index 3a52210add..15fd2618f8 100644 --- a/openviking/service/fs_service.py +++ b/openviking/service/fs_service.py @@ -8,6 +8,8 @@ from typing import Any, Dict, List, Optional +from openviking.storage.viking_fs 
import GrepEngine + from openviking.core.namespace import context_type_for_uri from openviking.core.uri_validation import validate_optional_viking_uri, validate_viking_uri from openviking.privacy import ( @@ -241,7 +243,7 @@ async def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, level_limit: int = 5, - engine: str = "auto", + engine: GrepEngine = "auto", switch_to_remote_threshold: int = 1000, remote_return_limit: int = 100, ) -> Dict: diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index ea23bc9dd4..b11f09c946 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -22,7 +22,10 @@ from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import PurePath -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union + +# Grep engine mode type alias — import this instead of repeating Literal["auto", "fs"] +GrepEngine = Literal["auto", "fs"] from openviking.core.namespace import ( NamespaceShapeError, @@ -228,7 +231,8 @@ def __init__( self.vector_store = vector_store self.retrieval_config = retrieval_config self._encryptor = encryptor - self._count_cache: Dict[str, tuple] = {} # uri → (count, timestamp) + self._count_cache: Dict[str, tuple] = {} # cache_key → (count, timestamp) + self._count_cache_max_size = 1024 self._bound_ctx: contextvars.ContextVar[Optional[RequestContext]] = contextvars.ContextVar( "vikingfs_bound_ctx", default=None ) @@ -688,7 +692,7 @@ async def grep( node_limit: Optional[int] = None, level_limit: int = 5, ctx: Optional[RequestContext] = None, - engine: str = "auto", + engine: GrepEngine = "auto", switch_to_remote_threshold: int = 1000, remote_return_limit: int = 100, ) -> Dict: @@ -748,7 +752,7 @@ async def grep( ctx=ctx, ) - async def _resolve_grep_engine(self, engine: str, uri: str, ctx, + async def _resolve_grep_engine(self, engine: 
GrepEngine, uri: str, ctx, switch_to_remote_threshold: int = 1000) -> str: """Resolve the actual grep engine to use.""" if engine == "fs": @@ -821,6 +825,11 @@ async def _get_cached_count(self, uri: str, ctx) -> int: count = await vector_store.count( filter=And([PathScope("uri", uri), Eq("level", 2)]), ctx=ctx ) + # Evict oldest entries if cache exceeds max size + if len(self._count_cache) >= self._count_cache_max_size: + oldest_keys = sorted(self._count_cache, key=lambda k: self._count_cache[k][1]) + for k in oldest_keys[:len(oldest_keys) // 2]: + del self._count_cache[k] self._count_cache[cache_key] = (count, now) return count @@ -856,12 +865,15 @@ async def _grep_vikingdb_then_fs(self, uri, pattern, exclude_uri, # Step 1: vikingdb recall candidate files try: + # Split regex alternation (e.g. "error|warning|fail") into individual keywords + # for bm25 search. Limit to 10 keywords per VikingDB API constraint. + keywords = [kw.strip() for kw in pattern.split("|") if kw.strip()][:10] filter_expr = And([ PathScope("uri", uri), Eq("level", 2), ]) result = await vector_store.search_by_keywords( - keywords=[pattern], + keywords=keywords, mode="bm25", fields=["content"], limit=remote_return_limit, diff --git a/openviking/sync_client.py b/openviking/sync_client.py index 65ec648115..d70b16ed75 100644 --- a/openviking/sync_client.py +++ b/openviking/sync_client.py @@ -8,6 +8,8 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from openviking.storage.viking_fs import GrepEngine + if TYPE_CHECKING: from openviking.session import Session @@ -366,7 +368,7 @@ def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, - engine: str = "auto", + engine: GrepEngine = "auto", switch_to_remote_threshold: int = 1000, remote_return_limit: int = 100, ) -> Dict: diff --git a/openviking_cli/client/base.py b/openviking_cli/client/base.py index a933444743..7068a9eb6c 100644 --- a/openviking_cli/client/base.py +++ 
b/openviking_cli/client/base.py @@ -8,6 +8,8 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Union +from openviking.storage.viking_fs import GrepEngine + from openviking.telemetry import TelemetryRequest @@ -194,7 +196,7 @@ async def grep( case_insensitive: bool = False, exclude_uri: Optional[str] = None, node_limit: Optional[int] = None, - engine: str = "auto", + engine: GrepEngine = "auto", switch_to_remote_threshold: int = 1000, remote_return_limit: int = 100, ) -> Dict[str, Any]: diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py index 46d832a390..cb729a89a1 100644 --- a/openviking_cli/client/http.py +++ b/openviking_cli/client/http.py @@ -11,6 +11,8 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union +from openviking.storage.viking_fs import GrepEngine + import httpx from openviking.telemetry import TelemetryRequest, normalize_telemetry_request @@ -740,7 +742,7 @@ async def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, - engine: str = "auto", + engine: GrepEngine = "auto", switch_to_remote_threshold: int = 1000, remote_return_limit: int = 100, ) -> Dict[str, Any]: diff --git a/openviking_cli/client/sync_http.py b/openviking_cli/client/sync_http.py index f868181d01..445c6f8f57 100644 --- a/openviking_cli/client/sync_http.py +++ b/openviking_cli/client/sync_http.py @@ -7,6 +7,8 @@ from typing import Any, Dict, List, Optional, Union +from openviking.storage.viking_fs import GrepEngine + from openviking.telemetry import TelemetryRequest from openviking_cli.client.http import AsyncHTTPClient from openviking_cli.utils import run_async @@ -313,7 +315,7 @@ def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, - engine: str = "auto", + engine: GrepEngine = "auto", switch_to_remote_threshold: int = 1000, remote_return_limit: int = 100, ) -> Dict: From 
0a4f7c361ade1354c767f88b24d8126894b434da Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Thu, 21 May 2026 11:38:49 +0800 Subject: [PATCH 03/31] fix(schema): use dynamic __version__ for schema_version and handle dev suffixes in version comparison --- openviking/storage/collection_schemas.py | 30 ++++++++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/openviking/storage/collection_schemas.py b/openviking/storage/collection_schemas.py index cb44712c0d..5fdd4ba8aa 100644 --- a/openviking/storage/collection_schemas.py +++ b/openviking/storage/collection_schemas.py @@ -41,11 +41,28 @@ logger = get_logger(__name__) EMBEDDING_META_MARKER = "\n\n[openviking.embedding]\n" +# Minimum OV version that supports content field + FullText config for grep bm25 +_FULLTEXT_MIN_VERSION = "0.3.18" + def _parse_version(v: str) -> tuple: - """Parse a semver-like string into a comparable tuple of ints.""" + """Parse a semver-like string into a comparable tuple of ints. + + Only the first 3 numeric segments are used (e.g. "0.3.18.dev23" → (0, 3, 18)). + Non-numeric suffixes like ".dev23", ".rc1", "+local" are ignored. + """ try: - return tuple(int(x) for x in v.split(".")) + parts = v.split(".") + numeric = [] + for p in parts: + # Stop at first non-numeric segment (e.g. 
"dev23", "rc1") + try: + numeric.append(int(p)) + except ValueError: + break + if len(numeric) == 3: + break + return tuple(numeric) if numeric else (0, 0, 0) except (ValueError, AttributeError): return (0, 0, 0) @@ -176,12 +193,14 @@ def _build_embedding_metadata(config: "OpenVikingConfig") -> Dict[str, Any]: except Exception: model_identity = model + from openviking import __version__ + return { "provider": provider, "model": model, "dimension": dimension, "model_identity": model_identity, - "schema_version": "0.3.18", + "schema_version": __version__, } @@ -265,18 +284,19 @@ async def init_context_collection(storage) -> bool: # Schema compatibility check: warn if collection was created by older OV version if existing_embedding_meta: existing_schema_version = existing_embedding_meta.get("schema_version", "0.0.0") - if _parse_version(existing_schema_version) < _parse_version("0.3.18"): + if _parse_version(existing_schema_version) < _parse_version(_FULLTEXT_MIN_VERSION): fields = existing_meta.get("Fields", []) has_content = any(f.get("FieldName") == "content" and f.get("FieldType") == "text" for f in fields) fulltext = existing_meta.get("FullText") or [] has_content_fulltext = any(ft.get("Field") == "content" for ft in fulltext) if not (has_content and has_content_fulltext): logger.warning( - "Collection schema is outdated (created by OV %s, requires >= 0.3.18). " + "Collection schema is outdated (created by OV %s, requires >= %s). " "Missing 'content' field or FullText config. " "grep engine=auto will fall back to fs. 
" "Recreate the collection to enable vikingdb-based grep.", existing_schema_version, + _FULLTEXT_MIN_VERSION, ) if existing_embedding_meta == embedding_meta: From 416acedca9ec92b9c6013570827548aceadc5214 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Thu, 21 May 2026 18:15:26 +0800 Subject: [PATCH 04/31] fix(schema): upsert data to vikingdb lack of content --- openviking/storage/ovpack/vectors.py | 2 ++ .../storage/viking_vector_index_backend.py | 20 ++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/openviking/storage/ovpack/vectors.py b/openviking/storage/ovpack/vectors.py index 2e2b425a5f..886704745d 100644 --- a/openviking/storage/ovpack/vectors.py +++ b/openviking/storage/ovpack/vectors.py @@ -294,6 +294,8 @@ async def _upsert_vector_snapshot_record( } if not payload.get("abstract"): payload["abstract"] = str(record.get("text") or "") + if "content" not in payload: + payload["content"] = str(record.get("text") or "")[:65536] try: await vector_store.upsert(payload, ctx=ctx) diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index adde1b7def..1513bac656 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -68,6 +68,11 @@ "id", "uri", "level", + "name", + "description", + "tags", + "abstract", + "content", "account_id", ] @@ -155,7 +160,20 @@ def _prepare_upsert_payload(self, data: Dict[str, Any]) -> Dict[str, Any]: """Drop runtime-only or stale legacy fields before writing back to the current schema.""" payload = {k: v for k, v in data.items() if v is not None} filtered = self._filter_known_fields(payload) - return {k: v for k, v in filtered.items() if v is not None} + result = {k: v for k, v in filtered.items() if v is not None} + + # Ensure text fields required by the schema are present (even if empty). + # VikingDB requires all schema-defined fields in upsert data. 
+ try: + coll = self._get_collection() + meta = self._get_meta_data(coll) + for field in meta.get("Fields", []): + if field.get("FieldType") == "text" and field.get("FieldName") not in result: + result[field["FieldName"]] = "" + except Exception: + pass + + return result async def _refresh_meta_data_async(self) -> None: self._meta_data_cache = await self._async_adapter.collection_meta() From 84bdd49ed32d9174ac897f8d902cadc00af46fc8 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Fri, 22 May 2026 11:15:54 +0800 Subject: [PATCH 05/31] chore: add benchmark for retrieval --- .../grep/vikingdb_bm25/step1_generate.py | 137 ++++++++++++++ .../vikingdb_bm25/step2_quick_add_resource.py | 122 +++++++++++++ .../grep/vikingdb_bm25/step3_build_index.py | 168 ++++++++++++++++++ .../grep/vikingdb_bm25/step4_benchmark.py | 148 +++++++++++++++ 4 files changed, 575 insertions(+) create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py b/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py new file mode 100644 index 0000000000..92dd4bd79b --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +"""Generate benchmark data for grep bm25 vs fs comparison. + +Produces ~80,000 markdown files (~50KB each, ~4GB total) in a 4-level +directory tree: + level0: 10 dirs + level1: 10 dirs per level0 (100 total) + level2: 10 dirs per level1 (1,000 total) + level3: 8 dirs per level2 (8,000 total) + files: 10 per level3 dir (80,000 total) + +Target keywords appear in ~1% of files each, simulating a realistic +large-scale codebase where bm25 recall dramatically reduces search scope. 
+""" +import os +import random + +BASE_DIR = os.path.expanduser("~/.openviking/data/benchmark") + +# Directory tree — each level has independent dir count +LEVEL0_DIRS = 10 +LEVEL1_DIRS = 10 # per level0 dir +LEVEL2_DIRS = 10 # per level1 dir +LEVEL3_DIRS = 8 # per level2 dir (flexible) +FILES_PER_DIR = 10 # per level3 dir + +# Total: 10 * 10 * 10 * 8 * 10 = 80,000 files +# Size: 80,000 * 50KB ≈ 4GB +# Each top-level dir: 10*10*8*10*50KB = 400MB + +TARGET_FILE_SIZE = 50000 # ~50KB + +TARGET_KEYWORDS = ["VikingDB", "FullText", "bm25", "search_by_keywords"] + +FILLER_WORDS = [ + "configuration", "deployment", "architecture", "implementation", + "performance", "optimization", "integration", "middleware", + "authentication", "authorization", "encryption", "validation", + "monitoring", "logging", "caching", "serialization", + "concurrency", "scalability", "reliability", "observability", + "throughput", "latency", "availability", "consistency", + "partitioning", "replication", "failover", "loadbalancing", + "containerization", "orchestration", "provisioning", "lifecycle", +] + +random.seed(42) + +total_files = LEVEL0_DIRS * LEVEL1_DIRS * LEVEL2_DIRS * LEVEL3_DIRS * FILES_PER_DIR +keyword_hit_count = max(1, total_files // 100) # 1% = 800 files per keyword +file_indices = list(range(total_files)) +keyword_files = {} +for kw in TARGET_KEYWORDS: + chosen = random.sample(file_indices, keyword_hit_count) + keyword_files[kw] = set(chosen) + + +def generate_section(title_level): + """Generate a markdown section with realistic filler content.""" + prefix = "#" * title_level + title_words = random.sample(FILLER_WORDS, 3) + title = f"{prefix} {' '.join(title_words).title()}\n\n" + + paragraphs = [] + for _ in range(random.randint(2, 5)): + sentences = [] + for _ in range(random.randint(3, 8)): + words = random.choices(FILLER_WORDS, k=random.randint(8, 15)) + sentences.append(" ".join(words).capitalize() + ".") + paragraphs.append(" ".join(sentences)) + + return title + 
"\n\n".join(paragraphs) + "\n\n" + + +def generate_file(file_idx): + """Generate a ~50KB markdown file with 3-5 h1 sections, each with 5-10 h2 sections.""" + parts = [] + num_h1 = random.randint(3, 5) + for _ in range(num_h1): + parts.append(generate_section(1)) + num_h2 = random.randint(5, 10) + for _ in range(num_h2): + parts.append(generate_section(2)) + + # Inject target keyword if this file is selected + for kw, indices in keyword_files.items(): + if file_idx in indices: + injection = ( + f"\nThis module provides {kw} integration for advanced search capabilities. " + f"The {kw} feature enables efficient keyword-based retrieval across large datasets.\n\n" + ) + parts[2] = parts[2] + injection # after first h1 + first h2 + + content = "".join(parts) + # Pad to target size if needed + if len(content) < TARGET_FILE_SIZE: + padding_parts = [] + while len("".join(padding_parts)) < TARGET_FILE_SIZE - len(content): + words = random.choices(FILLER_WORDS, k=20) + padding_parts.append(" ".join(words).capitalize() + ".\n") + content += "\n\n## Appendix\n\n" + "".join(padding_parts) + + return content[:TARGET_FILE_SIZE] + + +print(f"Generating {total_files} markdown files under {BASE_DIR}...") +print(f" Tree: level0={LEVEL0_DIRS} x level1={LEVEL1_DIRS} x level2={LEVEL2_DIRS} x level3={LEVEL3_DIRS}") +print(f" Files per leaf dir: {FILES_PER_DIR}") +print(f" Target keywords: {TARGET_KEYWORDS}") +print(f" Each keyword appears in ~{keyword_hit_count} files out of {total_files} " + f"(~{keyword_hit_count / total_files * 100:.1f}%)") +print(f" Estimated total size: ~{total_files * TARGET_FILE_SIZE / 1e9:.1f} GB") + +file_idx = 0 +os.makedirs(BASE_DIR, exist_ok=True) + +for i0 in range(LEVEL0_DIRS): + d0 = os.path.join(BASE_DIR, f"level0_{i0:02d}") + os.makedirs(d0, exist_ok=True) + for i1 in range(LEVEL1_DIRS): + d1 = os.path.join(d0, f"level1_{i1:02d}") + os.makedirs(d1, exist_ok=True) + for i2 in range(LEVEL2_DIRS): + d2 = os.path.join(d1, f"level2_{i2:02d}") + os.makedirs(d2, 
exist_ok=True) + for i3 in range(LEVEL3_DIRS): + d3 = os.path.join(d2, f"level3_{i3:02d}") + os.makedirs(d3, exist_ok=True) + for f in range(FILES_PER_DIR): + filepath = os.path.join(d3, f"doc_{f:04d}.md") + content = generate_file(file_idx) + with open(filepath, "w") as fh: + fh.write(content) + file_idx += 1 + if file_idx % 10000 == 0: + print(f" ... {file_idx} files written") + +print(f"Done! {file_idx} files generated under {BASE_DIR}") diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py b/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py new file mode 100644 index 0000000000..9f054bff20 --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +"""Step 2: Quick upload — import benchmark files skipping VLM+embedding. + +Walks the benchmark directory and uploads each file via the OpenViking Python SDK +with build_index=False, which skips VLM summarization and embedding. This makes +the upload phase fast and avoids circuit-breaker issues from VLM failures. + +After all files are uploaded, run step3_build_index.py to trigger VLM+embedding +in a controlled batch, then step4_benchmark.py to measure grep performance. + +Supports resume: a progress file (.add_resource_progress) tracks completed files. +If interrupted, re-run to automatically skip already-imported files. 
+ +Usage: + python3 step2_quick_add_resource.py [--no-resume] [--max-failures N] +""" +import argparse +import os +import sys + +BASE_DIR = os.path.expanduser("~/.openviking/data/benchmark") +DATA_DIR = os.path.expanduser("~/.openviking/data") +PROGRESS_FILE = os.path.join(BASE_DIR, ".add_resource_progress") + + +def load_progress() -> set: + """Load set of already-imported relative paths from progress file.""" + done = set() + if os.path.exists(PROGRESS_FILE): + with open(PROGRESS_FILE) as f: + for line in f: + line = line.strip() + if line: + done.add(line) + return done + + +def save_progress(rel_path: str) -> None: + """Append a completed relative path to the progress file and flush immediately.""" + with open(PROGRESS_FILE, "a") as f: + f.write(rel_path + "\n") + f.flush() + os.fsync(f.fileno()) + + +def main(): + parser = argparse.ArgumentParser( + description="Step 2: Quick upload benchmark files (skip VLM+embedding)" + ) + parser.add_argument( + "--no-resume", action="store_true", help="Disable auto-resume, start from scratch" + ) + parser.add_argument( + "--max-failures", type=int, default=10, help="Abort after N failures (default: 10)" + ) + args = parser.parse_args() + + from openviking.sync_client import SyncOpenViking + + client = SyncOpenViking() + client.initialize() + + # Collect all files first (deterministic order) + all_files = [] + for root, dirs, files in os.walk(BASE_DIR): + dirs.sort() + for fname in sorted(files): + if fname.endswith(".md"): + all_files.append(os.path.join(root, fname)) + + # Load resume state + done_set = set() + if not args.no_resume: + done_set = load_progress() + if done_set: + print(f"Resuming: {len(done_set)} files already imported (from {PROGRESS_FILE})") + + count = 0 + skipped = 0 + failed = 0 + + for filepath in all_files: + rel = os.path.relpath(filepath, DATA_DIR) + rel_dir = os.path.dirname(rel) + parent_uri = f"viking://resources/{rel_dir}" + + # Skip already-imported files + if rel in done_set: + skipped += 1 
+ continue + + idx = count + skipped + 1 + print(f"[{idx}/{len(all_files)}] Uploading {rel} ...", end=" ", flush=True) + + try: + client.add_resource( + path=filepath, + parent=parent_uri, + build_index=False, + wait=False, + create_parent=True, + ) + print("OK") + save_progress(rel) + except Exception as e: + print(f"FAILED: {e}") + failed += 1 + if failed >= args.max_failures: + print(f"\nToo many failures ({failed}), aborting. Re-run to resume.") + sys.exit(1) + + count += 1 + if count % 100 == 0: + print(f" ... {count} files uploaded this run ({failed} failed, {skipped} skipped)") + + print(f"\nDone! {count} uploaded, {skipped} skipped, {failed} failed") + if failed == 0: + print("Next step: run step3_build_index.py to trigger VLM+embedding") + + +if __name__ == "__main__": + main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py b/benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py new file mode 100644 index 0000000000..3daf691208 --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +"""Step 3: Build index — trigger VLM+embedding on already-uploaded files via CLI. + +After step2_quick_add_resource.py uploads all files with build_index=False (skipping +VLM and embedding), this script calls `ov reindex` on each level3 directory to +trigger VLM summarization and embedding in-place, without re-uploading files. + +Uses level3 directory granularity for progress tracking (8000 dirs, ~10 files each), +which gives fine-grained resume capability. 
+ +Usage: + python3 step3_build_index.py [--no-resume] [--mode MODE] [--max-failures N] +""" +import argparse +import os +import shlex +import subprocess +import sys +import time + +BASE_DIR = os.path.expanduser("~/.openviking/data/benchmark") +PROGRESS_FILE = os.path.join(BASE_DIR, ".build_index_progress") +BENCHMARK_URI = "viking://resources/benchmark" + +# Tree structure from step1_generate.py +LEVEL0_DIRS = 10 +LEVEL1_DIRS = 10 +LEVEL2_DIRS = 10 +LEVEL3_DIRS = 8 + + +def discover_level3_dirs() -> list[str]: + """Discover all level3 directories under BASE_DIR (deterministic order).""" + dirs = [] + for i0 in range(LEVEL0_DIRS): + d0 = os.path.join(BASE_DIR, f"level0_{i0:02d}") + if not os.path.isdir(d0): + continue + for i1 in range(LEVEL1_DIRS): + d1 = os.path.join(d0, f"level1_{i1:02d}") + if not os.path.isdir(d1): + continue + for i2 in range(LEVEL2_DIRS): + d2 = os.path.join(d1, f"level2_{i2:02d}") + if not os.path.isdir(d2): + continue + for i3 in range(LEVEL3_DIRS): + d3 = os.path.join(d2, f"level3_{i3:02d}") + if os.path.isdir(d3): + dirs.append(os.path.relpath(d3, BASE_DIR)) + return dirs + + +def load_progress() -> set: + """Load set of already-indexed level3 relative paths from progress file.""" + done = set() + if os.path.exists(PROGRESS_FILE): + with open(PROGRESS_FILE) as f: + for line in f: + line = line.strip() + if line: + done.add(line) + return done + + +def save_progress(rel_path: str) -> None: + """Append a completed level3 relative path to the progress file.""" + with open(PROGRESS_FILE, "a") as f: + f.write(rel_path + "\n") + f.flush() + os.fsync(f.fileno()) + + +def run_cmd(cmd: list[str]) -> tuple[int, str, str, float]: + """Run command, return (returncode, stdout, stderr, elapsed_seconds).""" + t0 = time.time() + result = subprocess.run(cmd, capture_output=True, text=True, timeout=600) + elapsed = time.time() - t0 + return result.returncode, result.stdout, result.stderr, elapsed + + +def main(): + parser = argparse.ArgumentParser( + 
description="Step 3: Build index — trigger VLM+embedding via ov reindex" + ) + parser.add_argument( + "--no-resume", action="store_true", help="Disable auto-resume, start from scratch" + ) + parser.add_argument( + "--mode", + choices=["vectors_only", "semantic_and_vectors"], + default="vectors_only", + help="Reindex mode (default: vectors_only = embedding)", + ) + parser.add_argument( + "--max-failures", type=int, default=50, help="Abort after N failures (default: 50)" + ) + args = parser.parse_args() + + level3_dirs = discover_level3_dirs() + + if not level3_dirs: + print(f"No level3 directories found under {BASE_DIR}") + print("Did you run step1_generate.py and step2_quick_add_resource.py first?") + sys.exit(1) + + # Load resume state + done_set = set() + if not args.no_resume: + done_set = load_progress() + if done_set: + print(f"Resuming: {len(done_set)} dirs already indexed (from {PROGRESS_FILE})") + + count = 0 + skipped = 0 + failed = 0 + total = len(level3_dirs) + print(f"{total} level3 dirs to index, {len(done_set)} already done") + + for rel_dir in level3_dirs: + if rel_dir in done_set: + skipped += 1 + continue + + uri = f"{BENCHMARK_URI}/{rel_dir}" + cmd = ["ov", "reindex", "--account", "default", "--user", "default", "--mode", args.mode, "--wait", "true", uri] + idx = count + skipped + 1 + cmd_str = shlex.join(cmd) + print(f"[{idx}/{total}] $ {cmd_str}") + + try: + rc, stdout, stderr, elapsed = run_cmd(cmd) + + if stdout.strip(): + for line in stdout.strip().splitlines(): + print(f" {line}") + if stderr.strip(): + for line in stderr.strip().splitlines(): + print(f" [stderr] {line}") + + if rc != 0: + print(f" FAILED (exit={rc}, {elapsed:.1f}s)") + failed += 1 + else: + print(f" OK ({elapsed:.1f}s)") + save_progress(rel_dir) + except subprocess.TimeoutExpired: + print(f" TIMEOUT (600s)") + failed += 1 + except Exception as e: + print(f" ERROR: {e}") + failed += 1 + + if failed >= args.max_failures: + print(f"\nToo many failures ({failed}), aborting. 
Re-run to resume.") + sys.exit(1) + + count += 1 + if count % 100 == 0: + print(f" ... {count} dirs indexed this run ({failed} failed, {skipped} skipped)") + + print(f"\nDone! {count} dirs indexed, {skipped} skipped, {failed} failed") + if failed == 0: + print("Next step: run step4_benchmark.py to measure grep performance") + + +if __name__ == "__main__": + main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py new file mode 100644 index 0000000000..c5d0749cc9 --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +"""Step 4: Benchmark grep performance: pure fs vs vikingdb bm25 + fs. + +Prerequisites: + 1. Run step1_generate.py to create test data + 2. Run step2_quick_add_resource.py to upload files (skip VLM+embedding) + 3. Run step3_build_index.py to build index (embedding) + +Expected results on 80K files (~4GB): + - engine=fs: tens to hundreds of seconds, frequent timeouts + - engine=auto: under 500ms per query (bm25 recall + local regex filter) + +Usage: + python3 step4_benchmark.py [--runs N] [--warmup N] + +Outputs a comparison table of elapsed time and match count for each query. 
+""" +import argparse +import shlex +import subprocess +import sys +import time + +BASE_URI = "viking://resources/benchmark" +OV_CMD = ["ov", "--account", "default", "--user", "default"] + +# Test cases: (label, pattern, engine, extra_args) +# Each case is run with `ov grep --uri [extra_args] --engine ` +TEST_CASES = [ + # --- Single keyword, different engines --- + ("fs: single keyword (VikingDB)", "VikingDB", "fs", []), + ("bm25: single keyword (VikingDB)", "VikingDB", "auto", ["--switch-to-remote-threshold", "0"]), + + ("fs: single keyword (FullText)", "FullText", "fs", []), + ("bm25: single keyword (FullText)", "FullText", "auto", ["--switch-to-remote-threshold", "0"]), + + # --- Multi-keyword (regex alternation) --- + ("fs: 2 keywords (VikingDB|FullText)", "VikingDB|FullText", "fs", []), + ("bm25: 2 keywords (VikingDB|FullText)", "VikingDB|FullText", "auto", ["--switch-to-remote-threshold", "0"]), + + ("fs: 3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", "fs", []), + ("bm25: 3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", "auto", ["--switch-to-remote-threshold", "0"]), + + # --- Rare keyword (lower hit count) --- + ("fs: rare keyword (search_by_keywords)", "search_by_keywords", "fs", []), + ("bm25: rare keyword (search_by_keywords)", "search_by_keywords", "auto", ["--switch-to-remote-threshold", "0"]), + + # --- Non-existent keyword (0 matches) --- + ("fs: no-match keyword (zzz_nonexistent)", "zzz_nonexistent", "fs", []), + ("bm25: no-match keyword (zzz_nonexistent)", "zzz_nonexistent", "auto", ["--switch-to-remote-threshold", "0"]), + + # --- Subdirectory scope (narrower URI, ~8K files) --- + ("fs: subdir scope (level0_00)", "VikingDB", "fs", ["--uri", f"{BASE_URI}/level0_00"]), + ("bm25: subdir scope (level0_00)", "VikingDB", "auto", ["--switch-to-remote-threshold", "0", "--uri", f"{BASE_URI}/level0_00"]), + + # --- Different remote_return_limit --- + ("bm25: return_limit=10", "VikingDB", "auto", 
["--switch-to-remote-threshold", "0", "--remote-return-limit", "10"]), + ("bm25: return_limit=1000", "VikingDB", "auto", ["--switch-to-remote-threshold", "0", "--remote-return-limit", "1000"]), +] + + +def run_grep(pattern: str, engine: str, extra_args: list) -> tuple[float, int, str, str]: + """Run a single grep command, return (elapsed_seconds, match_count, stdout, stderr).""" + cmd = OV_CMD + [ + "grep", + "--uri", BASE_URI, + "--engine", engine, + ] + extra_args + [pattern] + + cmd_str = shlex.join(cmd) + print(f" $ {cmd_str}") + + start = time.monotonic() + result = subprocess.run(cmd, capture_output=True, text=True) + elapsed = time.monotonic() - start + + match_count = 0 + if result.stdout: + match_count = len([l for l in result.stdout.strip().splitlines() if l.strip()]) + + return elapsed, match_count, result.stdout, result.stderr + + +def main(): + parser = argparse.ArgumentParser(description="Benchmark grep: fs vs bm25") + parser.add_argument("--runs", type=int, default=3, help="Number of runs per test case (default: 3)") + parser.add_argument("--warmup", type=int, default=1, help="Warmup runs before measuring (default: 1)") + args = parser.parse_args() + + print(f"{'Label':<50} {'Engine':<8} {'Avg(ms)':<10} {'Min(ms)':<10} {'Max(ms)':<10} {'Matches':<10}") + print("-" * 108) + + for label, pattern, engine, extra_args in TEST_CASES: + # Resolve URI from extra_args if overridden + uri = BASE_URI + for i, a in enumerate(extra_args): + if a == "--uri" and i + 1 < len(extra_args): + uri = extra_args[i + 1] + + # Warmup runs + for _ in range(args.warmup): + try: + run_grep(pattern, engine, extra_args) + except Exception: + break + + # Measured runs + times = [] + match_count = -1 + last_stdout = "" + last_stderr = "" + failed = False + for _ in range(args.runs): + try: + elapsed, matches, stdout, stderr = run_grep(pattern, engine, extra_args) + times.append(elapsed) + match_count = matches + last_stdout = stdout + last_stderr = stderr + except Exception: + 
failed = True + break + + if failed: + print(f"{label:<50} {engine:<8} FAILED") + elif not times: + print(f"{label:<50} {engine:<8} NO DATA") + else: + avg_ms = sum(times) / len(times) * 1000 + min_ms = min(times) * 1000 + max_ms = max(times) * 1000 + print(f"{label:<50} {engine:<8} {avg_ms:<10.1f} {min_ms:<10.1f} {max_ms:<10.1f} {match_count:<10}") + + # Print output from last run (compact) + if last_stdout.strip(): + for line in last_stdout.strip().splitlines()[:3]: + print(f" {line}") + if len(last_stdout.strip().splitlines()) > 3: + print(f" ... ({len(last_stdout.strip().splitlines())} lines total)") + if last_stderr.strip(): + for line in last_stderr.strip().splitlines()[:2]: + print(f" [stderr] {line}") + + print() + + +if __name__ == "__main__": + main() From df5a376990e3fe2cb07e7f2209ae0fe3dc48611e Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Mon, 25 May 2026 12:14:40 +0800 Subject: [PATCH 06/31] fix(grep): vikingdb return 200 and no results means no matching content, not necessary to fallback to local fs --- openviking/storage/viking_fs.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index b11f09c946..eafe263890 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -896,13 +896,8 @@ async def _grep_vikingdb_then_fs(self, uri, pattern, exclude_uri, if exclude_uri: candidate_uris = [u for u in candidate_uris if not u.startswith(exclude_uri)] if not candidate_uris: - # BM25 returned no candidates — fall back to fs to avoid missing matches - # (regex patterns may not work well as BM25 keywords) - return await self._grep_fs( - uri=uri, pattern=pattern, exclude_uri=exclude_uri, - case_insensitive=case_insensitive, node_limit=node_limit, - level_limit=level_limit, ctx=ctx, - ) + # BM25 returned no candidates — the index confirms no matching content + return {"matches": [], "count": 0, "match_count": 0, "files_scanned": 0} # Step 2: 
local fs precise matching on candidate files return await self._grep_in_files( From 9d07865fc038e1f475a666c80a525d3121335b42 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Mon, 25 May 2026 14:56:57 +0800 Subject: [PATCH 07/31] fix(benchmark): sub uri args; add report --- .../grep/vikingdb_bm25/step4_benchmark.py | 91 +++++++++++++------ 1 file changed, 65 insertions(+), 26 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py index c5d0749cc9..65ef1d2590 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py @@ -6,10 +6,6 @@ 2. Run step2_quick_add_resource.py to upload files (skip VLM+embedding) 3. Run step3_build_index.py to build index (embedding) -Expected results on 80K files (~4GB): - - engine=fs: tens to hundreds of seconds, frequent timeouts - - engine=auto: under 500ms per query (bm25 recall + local regex filter) - Usage: python3 step4_benchmark.py [--runs N] [--warmup N] @@ -25,47 +21,56 @@ OV_CMD = ["ov", "--account", "default", "--user", "default"] # Test cases: (label, pattern, engine, extra_args) -# Each case is run with `ov grep --uri [extra_args] --engine ` +# extra_args can override --uri; if present, the default --uri is omitted. 
TEST_CASES = [ - # --- Single keyword, different engines --- + # --- Single keyword --- ("fs: single keyword (VikingDB)", "VikingDB", "fs", []), ("bm25: single keyword (VikingDB)", "VikingDB", "auto", ["--switch-to-remote-threshold", "0"]), - ("fs: single keyword (FullText)", "FullText", "fs", []), + ("fs: single keyword (FullText)", "FullText", "fs", []), ("bm25: single keyword (FullText)", "FullText", "auto", ["--switch-to-remote-threshold", "0"]), # --- Multi-keyword (regex alternation) --- - ("fs: 2 keywords (VikingDB|FullText)", "VikingDB|FullText", "fs", []), + ("fs: 2 keywords (VikingDB|FullText)", "VikingDB|FullText", "fs", []), ("bm25: 2 keywords (VikingDB|FullText)", "VikingDB|FullText", "auto", ["--switch-to-remote-threshold", "0"]), - ("fs: 3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", "fs", []), + ("fs: 3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", "fs", []), ("bm25: 3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", "auto", ["--switch-to-remote-threshold", "0"]), # --- Rare keyword (lower hit count) --- - ("fs: rare keyword (search_by_keywords)", "search_by_keywords", "fs", []), + ("fs: rare keyword (search_by_keywords)", "search_by_keywords", "fs", []), ("bm25: rare keyword (search_by_keywords)", "search_by_keywords", "auto", ["--switch-to-remote-threshold", "0"]), # --- Non-existent keyword (0 matches) --- - ("fs: no-match keyword (zzz_nonexistent)", "zzz_nonexistent", "fs", []), - ("bm25: no-match keyword (zzz_nonexistent)", "zzz_nonexistent", "auto", ["--switch-to-remote-threshold", "0"]), + ("fs: no-match 1 keyword (zzz_nonexistent)", "zzz_nonexistent", "fs", []), + ("bm25: no-match 1 keyword (zzz_nonexistent)", "zzz_nonexistent", "auto", ["--switch-to-remote-threshold", "0"]), + + ("fs: no-match 2 keywords (zzz_a|zzz_b)", "zzz_a|zzz_b", "fs", []), + ("bm25: no-match 2 keywords (zzz_a|zzz_b)", "zzz_a|zzz_b", "auto", ["--switch-to-remote-threshold", "0"]), + + ("fs: no-match 3 keywords 
(zzz_a|zzz_b|zzz_c)", "zzz_a|zzz_b|zzz_c", "fs", []), + ("bm25: no-match 3 keywords (zzz_a|zzz_b|zzz_c)", "zzz_a|zzz_b|zzz_c", "auto", ["--switch-to-remote-threshold", "0"]), - # --- Subdirectory scope (narrower URI, ~8K files) --- - ("fs: subdir scope (level0_00)", "VikingDB", "fs", ["--uri", f"{BASE_URI}/level0_00"]), - ("bm25: subdir scope (level0_00)", "VikingDB", "auto", ["--switch-to-remote-threshold", "0", "--uri", f"{BASE_URI}/level0_00"]), + # --- Subdirectory scope (~8K files per level0 dir) --- + ("fs: subdir level0_00, VikingDB (~8K files)", "VikingDB", "fs", ["--uri", f"{BASE_URI}/level0_00"]), + ("bm25: subdir level0_00, VikingDB (~8K files)", "VikingDB", "auto", ["--uri", f"{BASE_URI}/level0_00", "--switch-to-remote-threshold", "0"]), - # --- Different remote_return_limit --- - ("bm25: return_limit=10", "VikingDB", "auto", ["--switch-to-remote-threshold", "0", "--remote-return-limit", "10"]), - ("bm25: return_limit=1000", "VikingDB", "auto", ["--switch-to-remote-threshold", "0", "--remote-return-limit", "1000"]), + ("fs: subdir level0_00, no-match (~8K files)", "zzz_nonexistent", "fs", ["--uri", f"{BASE_URI}/level0_00"]), + ("bm25: subdir level0_00, no-match (~8K files)", "zzz_nonexistent", "auto", ["--uri", f"{BASE_URI}/level0_00", "--switch-to-remote-threshold", "0"]), ] +def _has_uri_arg(extra_args: list) -> bool: + """Check if extra_args contains --uri.""" + return "--uri" in extra_args + + def run_grep(pattern: str, engine: str, extra_args: list) -> tuple[float, int, str, str]: """Run a single grep command, return (elapsed_seconds, match_count, stdout, stderr).""" - cmd = OV_CMD + [ - "grep", - "--uri", BASE_URI, - "--engine", engine, - ] + extra_args + [pattern] + cmd = OV_CMD + ["grep"] + if not _has_uri_arg(extra_args): + cmd += ["--uri", BASE_URI] + cmd += ["--engine", engine] + extra_args + [pattern] cmd_str = shlex.join(cmd) print(f" $ {cmd_str}") @@ -87,8 +92,11 @@ def main(): parser.add_argument("--warmup", type=int, default=1, 
help="Warmup runs before measuring (default: 1)") args = parser.parse_args() - print(f"{'Label':<50} {'Engine':<8} {'Avg(ms)':<10} {'Min(ms)':<10} {'Max(ms)':<10} {'Matches':<10}") - print("-" * 108) + print(f"{'Label':<50} {'Engine':<8} {'Avg(ms)':<10} {'Min(ms)':<10} {'Max(ms)':<10}") + print("-" * 98) + + # Collect results for summary report: key = scenario name, value = {engine: (avg_ms, match_count)} + results = {} for label, pattern, engine, extra_args in TEST_CASES: # Resolve URI from extra_args if overridden @@ -121,6 +129,7 @@ def main(): failed = True break + avg_ms = 0.0 if failed: print(f"{label:<50} {engine:<8} FAILED") elif not times: @@ -129,7 +138,7 @@ def main(): avg_ms = sum(times) / len(times) * 1000 min_ms = min(times) * 1000 max_ms = max(times) * 1000 - print(f"{label:<50} {engine:<8} {avg_ms:<10.1f} {min_ms:<10.1f} {max_ms:<10.1f} {match_count:<10}") + print(f"{label:<50} {engine:<8} {avg_ms:<10.1f} {min_ms:<10.1f} {max_ms:<10.1f}") # Print output from last run (compact) if last_stdout.strip(): @@ -141,6 +150,36 @@ def main(): for line in last_stderr.strip().splitlines()[:2]: print(f" [stderr] {line}") + # Store result for summary + # Derive scenario name by stripping engine prefix: "fs: xxx" or "bm25: xxx" -> "xxx" + scenario = label.split(": ", 1)[1].strip() if ": " in label else label.strip() + if scenario not in results: + results[scenario] = {} + results[scenario][engine] = (avg_ms, match_count) + + # Print summary report + print() + print("=" * 80) + print("PERFORMANCE REPORT: fs vs bm25 (auto)") + print("=" * 80) + print(f"{'Scenario':<45} {'fs(ms)':<12} {'auto(ms)':<12} {'Speedup':<10}") + print("-" * 80) + + for scenario, engines in results.items(): + fs_data = engines.get("fs") + auto_data = engines.get("auto") + if fs_data and auto_data and fs_data[0] > 0: + fs_ms = fs_data[0] + auto_ms = auto_data[0] + speedup = f"{fs_ms / auto_ms:.1f}x" + print(f"{scenario:<45} {fs_ms:<12.1f} {auto_ms:<12.1f} {speedup:<10}") + elif fs_data: + 
fs_ms = fs_data[0] + print(f"{scenario:<45} {fs_ms:<12.1f} {'N/A':<12} {'N/A':<10}") + elif auto_data: + auto_ms = auto_data[0] + print(f"{scenario:<45} {'N/A':<12} {auto_ms:<12.1f} {'N/A':<10}") + print() From 7be769765eb326661cdf87b0213ded77efc7310d Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Mon, 25 May 2026 19:54:05 +0800 Subject: [PATCH 08/31] refactor: code format by ruff --- .../grep/vikingdb_bm25/step1_generate.py | 59 ++++++--- .../vikingdb_bm25/step2_quick_add_resource.py | 1 + .../grep/vikingdb_bm25/step3_build_index.py | 17 ++- .../grep/vikingdb_bm25/step4_benchmark.py | 89 ++++++++++---- openviking/server/routers/search.py | 12 +- openviking/service/fs_service.py | 8 +- openviking/storage/collection_schemas.py | 4 +- .../storage/vectordb/collection/collection.py | 3 +- .../storage/vectordb/utils/validation.py | 4 +- openviking/storage/viking_fs.py | 113 ++++++++++++------ openviking/sync_client.py | 10 +- openviking_cli/client/sync_http.py | 11 +- 12 files changed, 237 insertions(+), 94 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py b/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py index 92dd4bd79b..d69717add6 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py @@ -12,6 +12,7 @@ Target keywords appear in ~1% of files each, simulating a realistic large-scale codebase where bm25 recall dramatically reduces search scope. 
""" + import os import random @@ -19,10 +20,10 @@ # Directory tree — each level has independent dir count LEVEL0_DIRS = 10 -LEVEL1_DIRS = 10 # per level0 dir -LEVEL2_DIRS = 10 # per level1 dir -LEVEL3_DIRS = 8 # per level2 dir (flexible) -FILES_PER_DIR = 10 # per level3 dir +LEVEL1_DIRS = 10 # per level0 dir +LEVEL2_DIRS = 10 # per level1 dir +LEVEL3_DIRS = 8 # per level2 dir (flexible) +FILES_PER_DIR = 10 # per level3 dir # Total: 10 * 10 * 10 * 8 * 10 = 80,000 files # Size: 80,000 * 50KB ≈ 4GB @@ -33,14 +34,38 @@ TARGET_KEYWORDS = ["VikingDB", "FullText", "bm25", "search_by_keywords"] FILLER_WORDS = [ - "configuration", "deployment", "architecture", "implementation", - "performance", "optimization", "integration", "middleware", - "authentication", "authorization", "encryption", "validation", - "monitoring", "logging", "caching", "serialization", - "concurrency", "scalability", "reliability", "observability", - "throughput", "latency", "availability", "consistency", - "partitioning", "replication", "failover", "loadbalancing", - "containerization", "orchestration", "provisioning", "lifecycle", + "configuration", + "deployment", + "architecture", + "implementation", + "performance", + "optimization", + "integration", + "middleware", + "authentication", + "authorization", + "encryption", + "validation", + "monitoring", + "logging", + "caching", + "serialization", + "concurrency", + "scalability", + "reliability", + "observability", + "throughput", + "latency", + "availability", + "consistency", + "partitioning", + "replication", + "failover", + "loadbalancing", + "containerization", + "orchestration", + "provisioning", + "lifecycle", ] random.seed(42) @@ -103,11 +128,15 @@ def generate_file(file_idx): print(f"Generating {total_files} markdown files under {BASE_DIR}...") -print(f" Tree: level0={LEVEL0_DIRS} x level1={LEVEL1_DIRS} x level2={LEVEL2_DIRS} x level3={LEVEL3_DIRS}") +print( + f" Tree: level0={LEVEL0_DIRS} x level1={LEVEL1_DIRS} x level2={LEVEL2_DIRS} x 
level3={LEVEL3_DIRS}" +) print(f" Files per leaf dir: {FILES_PER_DIR}") print(f" Target keywords: {TARGET_KEYWORDS}") -print(f" Each keyword appears in ~{keyword_hit_count} files out of {total_files} " - f"(~{keyword_hit_count / total_files * 100:.1f}%)") +print( + f" Each keyword appears in ~{keyword_hit_count} files out of {total_files} " + f"(~{keyword_hit_count / total_files * 100:.1f}%)" +) print(f" Estimated total size: ~{total_files * TARGET_FILE_SIZE / 1e9:.1f} GB") file_idx = 0 diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py b/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py index 9f054bff20..6fc1c2e89d 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py @@ -14,6 +14,7 @@ Usage: python3 step2_quick_add_resource.py [--no-resume] [--max-failures N] """ + import argparse import os import sys diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py b/benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py index 3daf691208..22d229acd4 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py @@ -11,6 +11,7 @@ Usage: python3 step3_build_index.py [--no-resume] [--mode MODE] [--max-failures N] """ + import argparse import os import shlex @@ -123,7 +124,19 @@ def main(): continue uri = f"{BENCHMARK_URI}/{rel_dir}" - cmd = ["ov", "reindex", "--account", "default", "--user", "default", "--mode", args.mode, "--wait", "true", uri] + cmd = [ + "ov", + "reindex", + "--account", + "default", + "--user", + "default", + "--mode", + args.mode, + "--wait", + "true", + uri, + ] idx = count + skipped + 1 cmd_str = shlex.join(cmd) print(f"[{idx}/{total}] $ {cmd_str}") @@ -145,7 +158,7 @@ def main(): print(f" OK ({elapsed:.1f}s)") save_progress(rel_dir) except subprocess.TimeoutExpired: - print(f" TIMEOUT (600s)") + print(" TIMEOUT 
(600s)") failed += 1 except Exception as e: print(f" ERROR: {e}") diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py index 65ef1d2590..9ad3fab38e 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py @@ -11,10 +11,10 @@ Outputs a comparison table of elapsed time and match count for each query. """ + import argparse import shlex import subprocess -import sys import time BASE_URI = "viking://resources/benchmark" @@ -26,37 +26,78 @@ # --- Single keyword --- ("fs: single keyword (VikingDB)", "VikingDB", "fs", []), ("bm25: single keyword (VikingDB)", "VikingDB", "auto", ["--switch-to-remote-threshold", "0"]), - ("fs: single keyword (FullText)", "FullText", "fs", []), ("bm25: single keyword (FullText)", "FullText", "auto", ["--switch-to-remote-threshold", "0"]), - # --- Multi-keyword (regex alternation) --- ("fs: 2 keywords (VikingDB|FullText)", "VikingDB|FullText", "fs", []), - ("bm25: 2 keywords (VikingDB|FullText)", "VikingDB|FullText", "auto", ["--switch-to-remote-threshold", "0"]), - + ( + "bm25: 2 keywords (VikingDB|FullText)", + "VikingDB|FullText", + "auto", + ["--switch-to-remote-threshold", "0"], + ), ("fs: 3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", "fs", []), - ("bm25: 3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", "auto", ["--switch-to-remote-threshold", "0"]), - + ( + "bm25: 3 keywords (VikingDB|FullText|bm25)", + "VikingDB|FullText|bm25", + "auto", + ["--switch-to-remote-threshold", "0"], + ), # --- Rare keyword (lower hit count) --- ("fs: rare keyword (search_by_keywords)", "search_by_keywords", "fs", []), - ("bm25: rare keyword (search_by_keywords)", "search_by_keywords", "auto", ["--switch-to-remote-threshold", "0"]), - + ( + "bm25: rare keyword (search_by_keywords)", + "search_by_keywords", + "auto", + ["--switch-to-remote-threshold", "0"], + ), # --- 
Non-existent keyword (0 matches) --- ("fs: no-match 1 keyword (zzz_nonexistent)", "zzz_nonexistent", "fs", []), - ("bm25: no-match 1 keyword (zzz_nonexistent)", "zzz_nonexistent", "auto", ["--switch-to-remote-threshold", "0"]), - + ( + "bm25: no-match 1 keyword (zzz_nonexistent)", + "zzz_nonexistent", + "auto", + ["--switch-to-remote-threshold", "0"], + ), ("fs: no-match 2 keywords (zzz_a|zzz_b)", "zzz_a|zzz_b", "fs", []), - ("bm25: no-match 2 keywords (zzz_a|zzz_b)", "zzz_a|zzz_b", "auto", ["--switch-to-remote-threshold", "0"]), - + ( + "bm25: no-match 2 keywords (zzz_a|zzz_b)", + "zzz_a|zzz_b", + "auto", + ["--switch-to-remote-threshold", "0"], + ), ("fs: no-match 3 keywords (zzz_a|zzz_b|zzz_c)", "zzz_a|zzz_b|zzz_c", "fs", []), - ("bm25: no-match 3 keywords (zzz_a|zzz_b|zzz_c)", "zzz_a|zzz_b|zzz_c", "auto", ["--switch-to-remote-threshold", "0"]), - + ( + "bm25: no-match 3 keywords (zzz_a|zzz_b|zzz_c)", + "zzz_a|zzz_b|zzz_c", + "auto", + ["--switch-to-remote-threshold", "0"], + ), # --- Subdirectory scope (~8K files per level0 dir) --- - ("fs: subdir level0_00, VikingDB (~8K files)", "VikingDB", "fs", ["--uri", f"{BASE_URI}/level0_00"]), - ("bm25: subdir level0_00, VikingDB (~8K files)", "VikingDB", "auto", ["--uri", f"{BASE_URI}/level0_00", "--switch-to-remote-threshold", "0"]), - - ("fs: subdir level0_00, no-match (~8K files)", "zzz_nonexistent", "fs", ["--uri", f"{BASE_URI}/level0_00"]), - ("bm25: subdir level0_00, no-match (~8K files)", "zzz_nonexistent", "auto", ["--uri", f"{BASE_URI}/level0_00", "--switch-to-remote-threshold", "0"]), + ( + "fs: subdir level0_00, VikingDB (~8K files)", + "VikingDB", + "fs", + ["--uri", f"{BASE_URI}/level0_00"], + ), + ( + "bm25: subdir level0_00, VikingDB (~8K files)", + "VikingDB", + "auto", + ["--uri", f"{BASE_URI}/level0_00", "--switch-to-remote-threshold", "0"], + ), + ( + "fs: subdir level0_00, no-match (~8K files)", + "zzz_nonexistent", + "fs", + ["--uri", f"{BASE_URI}/level0_00"], + ), + ( + "bm25: subdir level0_00, 
no-match (~8K files)", + "zzz_nonexistent", + "auto", + ["--uri", f"{BASE_URI}/level0_00", "--switch-to-remote-threshold", "0"], + ), ] @@ -88,8 +129,12 @@ def run_grep(pattern: str, engine: str, extra_args: list) -> tuple[float, int, s def main(): parser = argparse.ArgumentParser(description="Benchmark grep: fs vs bm25") - parser.add_argument("--runs", type=int, default=3, help="Number of runs per test case (default: 3)") - parser.add_argument("--warmup", type=int, default=1, help="Warmup runs before measuring (default: 1)") + parser.add_argument( + "--runs", type=int, default=3, help="Number of runs per test case (default: 3)" + ) + parser.add_argument( + "--warmup", type=int, default=1, help="Warmup runs before measuring (default: 1)" + ) args = parser.parse_args() print(f"{'Label':<50} {'Engine':<8} {'Avg(ms)':<10} {'Min(ms)':<10} {'Max(ms)':<10}") diff --git a/openviking/server/routers/search.py b/openviking/server/routers/search.py index d559ab0163..6e2783ba69 100644 --- a/openviking/server/routers/search.py +++ b/openviking/server/routers/search.py @@ -11,12 +11,12 @@ from openviking.core.path_variables import resolve_path_variables from openviking.pyagfs.exceptions import AGFSClientError, AGFSNotFoundError from openviking.server.auth import get_request_context -from openviking.storage.viking_fs import GrepEngine from openviking.server.dependencies import get_service from openviking.server.error_mapping import map_exception from openviking.server.identity import RequestContext from openviking.server.models import Response from openviking.server.telemetry import run_operation +from openviking.storage.viking_fs import GrepEngine from openviking.telemetry import TelemetryRequest from openviking.utils.search_filters import _resolve_levels, merge_time_filter from openviking_cli.exceptions import InvalidArgumentError, NotFoundError @@ -113,8 +113,14 @@ class GrepRequest(BaseModel): node_limit: Optional[int] = None level_limit: int = 5 engine: GrepEngine = "auto" - 
switch_to_remote_threshold: int = Field(default=1000, ge=0, description="L2 record count threshold to switch to vikingdb; 0 means always use vikingdb") - remote_return_limit: int = Field(default=100, ge=1, le=100000, description="Maximum files recalled by vikingdb bm25") + switch_to_remote_threshold: int = Field( + default=1000, + ge=0, + description="L2 record count threshold to switch to vikingdb; 0 means always use vikingdb", + ) + remote_return_limit: int = Field( + default=100, ge=1, le=100000, description="Maximum files recalled by vikingdb bm25" + ) class GlobRequest(BaseModel): diff --git a/openviking/service/fs_service.py b/openviking/service/fs_service.py index 15fd2618f8..64577e8548 100644 --- a/openviking/service/fs_service.py +++ b/openviking/service/fs_service.py @@ -8,8 +8,6 @@ from typing import Any, Dict, List, Optional -from openviking.storage.viking_fs import GrepEngine - from openviking.core.namespace import context_type_for_uri from openviking.core.uri_validation import validate_optional_viking_uri, validate_viking_uri from openviking.privacy import ( @@ -19,7 +17,7 @@ ) from openviking.server.identity import RequestContext from openviking.storage.content_write import ContentWriteCoordinator -from openviking.storage.viking_fs import VikingFS +from openviking.storage.viking_fs import GrepEngine, VikingFS from openviking.utils.embedding_utils import vectorize_directory_meta from openviking_cli.exceptions import NotInitializedError from openviking_cli.utils import VikingURI, get_logger @@ -159,7 +157,9 @@ def _resolve_directory_uris(uri: str) -> tuple[str, str]: directory_uri = VikingURI(abstract_uri).parent.uri return directory_uri, abstract_uri - async def rm(self, uri: str, ctx: RequestContext, recursive: bool = False) -> Optional[Dict[str, Any]]: + async def rm( + self, uri: str, ctx: RequestContext, recursive: bool = False + ) -> Optional[Dict[str, Any]]: """Remove resource.""" uri = validate_viking_uri(uri) viking_fs = 
self._ensure_initialized() diff --git a/openviking/storage/collection_schemas.py b/openviking/storage/collection_schemas.py index 5fdd4ba8aa..118bd72548 100644 --- a/openviking/storage/collection_schemas.py +++ b/openviking/storage/collection_schemas.py @@ -286,7 +286,9 @@ async def init_context_collection(storage) -> bool: existing_schema_version = existing_embedding_meta.get("schema_version", "0.0.0") if _parse_version(existing_schema_version) < _parse_version(_FULLTEXT_MIN_VERSION): fields = existing_meta.get("Fields", []) - has_content = any(f.get("FieldName") == "content" and f.get("FieldType") == "text" for f in fields) + has_content = any( + f.get("FieldName") == "content" and f.get("FieldType") == "text" for f in fields + ) fulltext = existing_meta.get("FullText") or [] has_content_fulltext = any(ft.get("Field") == "content" for ft in fulltext) if not (has_content and has_content_fulltext): diff --git a/openviking/storage/vectordb/collection/collection.py b/openviking/storage/vectordb/collection/collection.py index c42b90fc79..ed25ce04b7 100644 --- a/openviking/storage/vectordb/collection/collection.py +++ b/openviking/storage/vectordb/collection/collection.py @@ -377,8 +377,7 @@ def search_by_keywords( if self.__collection is None: raise RuntimeError("Collection is closed") return self.__collection.search_by_keywords( - index_name, keywords, query, limit, offset, filters, output_fields, - mode, fields + index_name, keywords, query, limit, offset, filters, output_fields, mode, fields ) def search_by_id( diff --git a/openviking/storage/vectordb/utils/validation.py b/openviking/storage/vectordb/utils/validation.py index 378756d378..90864eac73 100644 --- a/openviking/storage/vectordb/utils/validation.py +++ b/openviking/storage/vectordb/utils/validation.py @@ -152,7 +152,9 @@ class CollectionMetaConfig(BaseModel): ProjectName: Optional[str] = None Description: Optional[str] = Field(None, max_length=65535) Vectorize: Optional[VectorizeConfig] = None - FullText: 
Optional[List[dict]] = None # e.g. [{"Field": "content", "Analyzer": {"Tokenizer": "standard"}}] + FullText: Optional[List[dict]] = ( + None # e.g. [{"Field": "content", "Analyzer": {"Tokenizer": "standard"}}] + ) # Internal fields _FieldsCount: Optional[int] = None diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index eafe263890..f52389a688 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -752,8 +752,9 @@ async def grep( ctx=ctx, ) - async def _resolve_grep_engine(self, engine: GrepEngine, uri: str, ctx, - switch_to_remote_threshold: int = 1000) -> str: + async def _resolve_grep_engine( + self, engine: GrepEngine, uri: str, ctx, switch_to_remote_threshold: int = 1000 + ) -> str: """Resolve the actual grep engine to use.""" if engine == "fs": return "fs" @@ -781,7 +782,9 @@ async def _resolve_grep_engine(self, engine: GrepEngine, uri: str, ctx, if count < switch_to_remote_threshold: return "fs" except Exception: - logger.debug("grep engine=auto: count() check failed, falling back to fs", exc_info=True) + logger.debug( + "grep engine=auto: count() check failed, falling back to fs", exc_info=True + ) return "fs" return "vikingdb_then_fs" @@ -796,16 +799,15 @@ async def _collection_has_fulltext(self, vector_store, ctx) -> bool: return False fields = meta.get("Fields", []) has_content = any( - f.get("FieldName") == "content" and f.get("FieldType") == "text" - for f in fields + f.get("FieldName") == "content" and f.get("FieldType") == "text" for f in fields ) fulltext = meta.get("FullText") or [] - has_content_fulltext = any( - ft.get("Field") == "content" for ft in fulltext - ) + has_content_fulltext = any(ft.get("Field") == "content" for ft in fulltext) return has_content and has_content_fulltext except Exception: - logger.debug("Failed to check collection fulltext config, assuming no fulltext", exc_info=True) + logger.debug( + "Failed to check collection fulltext config, assuming no fulltext", 
exc_info=True + ) return False async def _get_cached_count(self, uri: str, ctx) -> int: @@ -828,38 +830,59 @@ async def _get_cached_count(self, uri: str, ctx) -> int: # Evict oldest entries if cache exceeds max size if len(self._count_cache) >= self._count_cache_max_size: oldest_keys = sorted(self._count_cache, key=lambda k: self._count_cache[k][1]) - for k in oldest_keys[:len(oldest_keys) // 2]: + for k in oldest_keys[: len(oldest_keys) // 2]: del self._count_cache[k] self._count_cache[cache_key] = (count, now) return count - async def _grep_fs(self, uri, pattern, exclude_uri, case_insensitive, - node_limit, level_limit, ctx): + async def _grep_fs( + self, uri, pattern, exclude_uri, case_insensitive, node_limit, level_limit, ctx + ): """Existing fs path (renamed from original inline logic).""" if self._encryptor: return await self._grep_encrypted( - uri=uri, pattern=pattern, exclude_uri=exclude_uri, - case_insensitive=case_insensitive, node_limit=node_limit, - level_limit=level_limit, ctx=ctx, + uri=uri, + pattern=pattern, + exclude_uri=exclude_uri, + case_insensitive=case_insensitive, + node_limit=node_limit, + level_limit=level_limit, + ctx=ctx, ) try: return await self._grep_with_agfs( - uri=uri, pattern=pattern, exclude_uri=exclude_uri, - case_insensitive=case_insensitive, node_limit=node_limit, - level_limit=level_limit, ctx=ctx, + uri=uri, + pattern=pattern, + exclude_uri=exclude_uri, + case_insensitive=case_insensitive, + node_limit=node_limit, + level_limit=level_limit, + ctx=ctx, ) except (AttributeError, AGFSNotSupportedError, NotImplementedError) as e: logger.debug(f"agfs grep unavailable, falling back to VikingFS implementation: {e}") return await self._grep_encrypted( - uri=uri, pattern=pattern, exclude_uri=exclude_uri, - case_insensitive=case_insensitive, node_limit=node_limit, - level_limit=level_limit, ctx=ctx, + uri=uri, + pattern=pattern, + exclude_uri=exclude_uri, + case_insensitive=case_insensitive, + node_limit=node_limit, + 
level_limit=level_limit, + ctx=ctx, ) - async def _grep_vikingdb_then_fs(self, uri, pattern, exclude_uri, - case_insensitive, node_limit, - level_limit, remote_return_limit, ctx): + async def _grep_vikingdb_then_fs( + self, + uri, + pattern, + exclude_uri, + case_insensitive, + node_limit, + level_limit, + remote_return_limit, + ctx, + ): """VikingDB bm25 recall + local fs precise matching.""" vector_store = self._get_vector_store() @@ -868,10 +891,12 @@ async def _grep_vikingdb_then_fs(self, uri, pattern, exclude_uri, # Split regex alternation (e.g. "error|warning|fail") into individual keywords # for bm25 search. Limit to 10 keywords per VikingDB API constraint. keywords = [kw.strip() for kw in pattern.split("|") if kw.strip()][:10] - filter_expr = And([ - PathScope("uri", uri), - Eq("level", 2), - ]) + filter_expr = And( + [ + PathScope("uri", uri), + Eq("level", 2), + ] + ) result = await vector_store.search_by_keywords( keywords=keywords, mode="bm25", @@ -884,15 +909,16 @@ async def _grep_vikingdb_then_fs(self, uri, pattern, exclude_uri, except Exception as e: logger.warning(f"grep vikingdb step failed, falling back to fs: {e}") return await self._grep_fs( - uri=uri, pattern=pattern, exclude_uri=exclude_uri, - case_insensitive=case_insensitive, node_limit=node_limit, - level_limit=level_limit, ctx=ctx, + uri=uri, + pattern=pattern, + exclude_uri=exclude_uri, + case_insensitive=case_insensitive, + node_limit=node_limit, + level_limit=level_limit, + ctx=ctx, ) - candidate_uris = [ - r["uri"] for r in result - if r.get("uri") - ] + candidate_uris = [r["uri"] for r in result if r.get("uri")] if exclude_uri: candidate_uris = [u for u in candidate_uris if not u.startswith(exclude_uri)] if not candidate_uris: @@ -901,12 +927,21 @@ async def _grep_vikingdb_then_fs(self, uri, pattern, exclude_uri, # Step 2: local fs precise matching on candidate files return await self._grep_in_files( - candidate_uris, pattern, case_insensitive, node_limit, ctx, + candidate_uris, + 
pattern, + case_insensitive, + node_limit, + ctx, ) - async def _grep_in_files(self, file_uris: List[str], pattern: str, - case_insensitive: bool, node_limit: Optional[int], - ctx: Optional[RequestContext]) -> Dict: + async def _grep_in_files( + self, + file_uris: List[str], + pattern: str, + case_insensitive: bool, + node_limit: Optional[int], + ctx: Optional[RequestContext], + ) -> Dict: """Execute regex matching in specified file list (vikingdb_then_fs Step 2).""" flags = re.IGNORECASE if case_insensitive else 0 compiled = re.compile(pattern, flags) diff --git a/openviking/sync_client.py b/openviking/sync_client.py index d70b16ed75..9140da869d 100644 --- a/openviking/sync_client.py +++ b/openviking/sync_client.py @@ -375,8 +375,14 @@ def grep( """Content search""" return run_async( self._async_client.grep( - uri, pattern, case_insensitive, node_limit, exclude_uri, - engine, switch_to_remote_threshold, remote_return_limit, + uri, + pattern, + case_insensitive, + node_limit, + exclude_uri, + engine, + switch_to_remote_threshold, + remote_return_limit, ) ) diff --git a/openviking_cli/client/sync_http.py b/openviking_cli/client/sync_http.py index 445c6f8f57..52f82edda4 100644 --- a/openviking_cli/client/sync_http.py +++ b/openviking_cli/client/sync_http.py @@ -8,7 +8,6 @@ from typing import Any, Dict, List, Optional, Union from openviking.storage.viking_fs import GrepEngine - from openviking.telemetry import TelemetryRequest from openviking_cli.client.http import AsyncHTTPClient from openviking_cli.utils import run_async @@ -322,8 +321,14 @@ def grep( """Content search with pattern.""" return run_async( self._async_client.grep( - uri, pattern, case_insensitive, node_limit, exclude_uri, - engine, switch_to_remote_threshold, remote_return_limit, + uri, + pattern, + case_insensitive, + node_limit, + exclude_uri, + engine, + switch_to_remote_threshold, + remote_return_limit, ) ) From 261aaf2ed95658f8ddcc20382ad53c25797b0b8f Mon Sep 17 00:00:00 2001 From: "liuyang.max" 
Date: Tue, 26 May 2026 14:36:40 +0800 Subject: [PATCH 09/31] optimize: move grep config (engine and switch_to_remote_threshold) to ov.conf --- .../grep/vikingdb_bm25/step4_benchmark.py | 146 ++++-------------- crates/ov_cli/src/client.rs | 8 - crates/ov_cli/src/commands/search.rs | 4 - crates/ov_cli/src/handlers.rs | 4 - crates/ov_cli/src/main.rs | 14 +- docs/en/guides/01-configuration.md | 22 ++- docs/zh/guides/01-configuration.md | 20 +++ examples/ov.conf.example | 1 + openviking/async_client.py | 10 +- openviking/client/local.py | 8 +- openviking/server/routers/search.py | 14 +- openviking/service/core.py | 1 + openviking/service/fs_service.py | 8 +- openviking/storage/viking_fs.py | 65 +++++--- openviking/sync_client.py | 10 +- openviking_cli/client/base.py | 7 +- openviking_cli/client/http.py | 10 +- openviking_cli/client/sync_http.py | 9 +- openviking_cli/utils/config/__init__.py | 3 + openviking_cli/utils/config/grep_config.py | 31 ++++ .../utils/config/open_viking_config.py | 6 + 21 files changed, 173 insertions(+), 228 deletions(-) create mode 100644 openviking_cli/utils/config/grep_config.py diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py index 9ad3fab38e..1f14e85e1e 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py @@ -6,6 +6,12 @@ 2. Run step2_quick_add_resource.py to upload files (skip VLM+embedding) 3. Run step3_build_index.py to build index (embedding) +NOTE: `engine` and `switch_to_remote_threshold` are now server-side config +(ov.conf `[grep]` section). To benchmark different engines, update ov.conf +and restart the server before each run. The default config uses engine="auto" +with switch_to_remote_threshold=1000; set switch_to_remote_threshold=0 to +force VikingDB bm25 recall. 
+ Usage: python3 step4_benchmark.py [--runs N] [--warmup N] @@ -20,84 +26,28 @@ BASE_URI = "viking://resources/benchmark" OV_CMD = ["ov", "--account", "default", "--user", "default"] -# Test cases: (label, pattern, engine, extra_args) +# Test cases: (label, pattern, extra_args) # extra_args can override --uri; if present, the default --uri is omitted. TEST_CASES = [ # --- Single keyword --- - ("fs: single keyword (VikingDB)", "VikingDB", "fs", []), - ("bm25: single keyword (VikingDB)", "VikingDB", "auto", ["--switch-to-remote-threshold", "0"]), - ("fs: single keyword (FullText)", "FullText", "fs", []), - ("bm25: single keyword (FullText)", "FullText", "auto", ["--switch-to-remote-threshold", "0"]), + ("single keyword (VikingDB)", "VikingDB", []), + ("single keyword (FullText)", "FullText", []), # --- Multi-keyword (regex alternation) --- - ("fs: 2 keywords (VikingDB|FullText)", "VikingDB|FullText", "fs", []), - ( - "bm25: 2 keywords (VikingDB|FullText)", - "VikingDB|FullText", - "auto", - ["--switch-to-remote-threshold", "0"], - ), - ("fs: 3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", "fs", []), - ( - "bm25: 3 keywords (VikingDB|FullText|bm25)", - "VikingDB|FullText|bm25", - "auto", - ["--switch-to-remote-threshold", "0"], - ), + ("2 keywords (VikingDB|FullText)", "VikingDB|FullText", []), + ("3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", []), # --- Rare keyword (lower hit count) --- - ("fs: rare keyword (search_by_keywords)", "search_by_keywords", "fs", []), - ( - "bm25: rare keyword (search_by_keywords)", - "search_by_keywords", - "auto", - ["--switch-to-remote-threshold", "0"], - ), + ("rare keyword (search_by_keywords)", "search_by_keywords", []), # --- Non-existent keyword (0 matches) --- - ("fs: no-match 1 keyword (zzz_nonexistent)", "zzz_nonexistent", "fs", []), - ( - "bm25: no-match 1 keyword (zzz_nonexistent)", - "zzz_nonexistent", - "auto", - ["--switch-to-remote-threshold", "0"], - ), - ("fs: no-match 2 keywords 
(zzz_a|zzz_b)", "zzz_a|zzz_b", "fs", []), - ( - "bm25: no-match 2 keywords (zzz_a|zzz_b)", - "zzz_a|zzz_b", - "auto", - ["--switch-to-remote-threshold", "0"], - ), - ("fs: no-match 3 keywords (zzz_a|zzz_b|zzz_c)", "zzz_a|zzz_b|zzz_c", "fs", []), - ( - "bm25: no-match 3 keywords (zzz_a|zzz_b|zzz_c)", - "zzz_a|zzz_b|zzz_c", - "auto", - ["--switch-to-remote-threshold", "0"], - ), + ("no-match 1 keyword (zzz_nonexistent)", "zzz_nonexistent", []), + ("no-match 2 keywords (zzz_a|zzz_b)", "zzz_a|zzz_b", []), + ("no-match 3 keywords (zzz_a|zzz_b|zzz_c)", "zzz_a|zzz_b|zzz_c", []), # --- Subdirectory scope (~8K files per level0 dir) --- + ("subdir level0_00, VikingDB (~8K files)", "VikingDB", ["--uri", f"{BASE_URI}/level0_00"]), ( - "fs: subdir level0_00, VikingDB (~8K files)", - "VikingDB", - "fs", - ["--uri", f"{BASE_URI}/level0_00"], - ), - ( - "bm25: subdir level0_00, VikingDB (~8K files)", - "VikingDB", - "auto", - ["--uri", f"{BASE_URI}/level0_00", "--switch-to-remote-threshold", "0"], - ), - ( - "fs: subdir level0_00, no-match (~8K files)", + "subdir level0_00, no-match (~8K files)", "zzz_nonexistent", - "fs", ["--uri", f"{BASE_URI}/level0_00"], ), - ( - "bm25: subdir level0_00, no-match (~8K files)", - "zzz_nonexistent", - "auto", - ["--uri", f"{BASE_URI}/level0_00", "--switch-to-remote-threshold", "0"], - ), ] @@ -106,12 +56,12 @@ def _has_uri_arg(extra_args: list) -> bool: return "--uri" in extra_args -def run_grep(pattern: str, engine: str, extra_args: list) -> tuple[float, int, str, str]: +def run_grep(pattern: str, extra_args: list) -> tuple[float, int, str, str]: """Run a single grep command, return (elapsed_seconds, match_count, stdout, stderr).""" cmd = OV_CMD + ["grep"] if not _has_uri_arg(extra_args): cmd += ["--uri", BASE_URI] - cmd += ["--engine", engine] + extra_args + [pattern] + cmd += extra_args + [pattern] cmd_str = shlex.join(cmd) print(f" $ {cmd_str}") @@ -137,53 +87,41 @@ def main(): ) args = parser.parse_args() - print(f"{'Label':<50} 
{'Engine':<8} {'Avg(ms)':<10} {'Min(ms)':<10} {'Max(ms)':<10}") - print("-" * 98) - - # Collect results for summary report: key = scenario name, value = {engine: (avg_ms, match_count)} - results = {} - - for label, pattern, engine, extra_args in TEST_CASES: - # Resolve URI from extra_args if overridden - uri = BASE_URI - for i, a in enumerate(extra_args): - if a == "--uri" and i + 1 < len(extra_args): - uri = extra_args[i + 1] + print(f"{'Label':<50} {'Avg(ms)':<10} {'Min(ms)':<10} {'Max(ms)':<10}") + print("-" * 88) + for label, pattern, extra_args in TEST_CASES: # Warmup runs for _ in range(args.warmup): try: - run_grep(pattern, engine, extra_args) + run_grep(pattern, extra_args) except Exception: break # Measured runs times = [] - match_count = -1 last_stdout = "" last_stderr = "" failed = False for _ in range(args.runs): try: - elapsed, matches, stdout, stderr = run_grep(pattern, engine, extra_args) + elapsed, matches, stdout, stderr = run_grep(pattern, extra_args) times.append(elapsed) - match_count = matches last_stdout = stdout last_stderr = stderr except Exception: failed = True break - avg_ms = 0.0 if failed: - print(f"{label:<50} {engine:<8} FAILED") + print(f"{label:<50} FAILED") elif not times: - print(f"{label:<50} {engine:<8} NO DATA") + print(f"{label:<50} NO DATA") else: avg_ms = sum(times) / len(times) * 1000 min_ms = min(times) * 1000 max_ms = max(times) * 1000 - print(f"{label:<50} {engine:<8} {avg_ms:<10.1f} {min_ms:<10.1f} {max_ms:<10.1f}") + print(f"{label:<50} {avg_ms:<10.1f} {min_ms:<10.1f} {max_ms:<10.1f}") # Print output from last run (compact) if last_stdout.strip(): @@ -195,36 +133,6 @@ def main(): for line in last_stderr.strip().splitlines()[:2]: print(f" [stderr] {line}") - # Store result for summary - # Derive scenario name by stripping engine prefix: "fs: xxx" or "bm25: xxx" -> "xxx" - scenario = label.split(": ", 1)[1].strip() if ": " in label else label.strip() - if scenario not in results: - results[scenario] = {} - 
results[scenario][engine] = (avg_ms, match_count) - - # Print summary report - print() - print("=" * 80) - print("PERFORMANCE REPORT: fs vs bm25 (auto)") - print("=" * 80) - print(f"{'Scenario':<45} {'fs(ms)':<12} {'auto(ms)':<12} {'Speedup':<10}") - print("-" * 80) - - for scenario, engines in results.items(): - fs_data = engines.get("fs") - auto_data = engines.get("auto") - if fs_data and auto_data and fs_data[0] > 0: - fs_ms = fs_data[0] - auto_ms = auto_data[0] - speedup = f"{fs_ms / auto_ms:.1f}x" - print(f"{scenario:<45} {fs_ms:<12.1f} {auto_ms:<12.1f} {speedup:<10}") - elif fs_data: - fs_ms = fs_data[0] - print(f"{scenario:<45} {fs_ms:<12.1f} {'N/A':<12} {'N/A':<10}") - elif auto_data: - auto_ms = auto_data[0] - print(f"{scenario:<45} {'N/A':<12} {auto_ms:<12.1f} {'N/A':<10}") - print() diff --git a/crates/ov_cli/src/client.rs b/crates/ov_cli/src/client.rs index e7e39841e7..bfe4015382 100644 --- a/crates/ov_cli/src/client.rs +++ b/crates/ov_cli/src/client.rs @@ -425,8 +425,6 @@ impl HttpClient { ignore_case: bool, node_limit: i32, level_limit: i32, - engine: Option, - switch_to_remote_threshold: Option, remote_return_limit: Option, ) -> Result { let mut body = serde_json::json!({ @@ -437,12 +435,6 @@ impl HttpClient { "node_limit": node_limit, "level_limit": level_limit, }); - if let Some(eng) = engine { - body["engine"] = serde_json::json!(eng); - } - if let Some(threshold) = switch_to_remote_threshold { - body["switch_to_remote_threshold"] = serde_json::json!(threshold); - } if let Some(limit) = remote_return_limit { body["remote_return_limit"] = serde_json::json!(limit); } diff --git a/crates/ov_cli/src/commands/search.rs b/crates/ov_cli/src/commands/search.rs index a4a934a05e..a581e3c2db 100644 --- a/crates/ov_cli/src/commands/search.rs +++ b/crates/ov_cli/src/commands/search.rs @@ -69,8 +69,6 @@ pub async fn grep( ignore_case: bool, node_limit: i32, level_limit: i32, - engine: Option, - switch_to_remote_threshold: Option, remote_return_limit: Option, 
output_format: OutputFormat, compact: bool, @@ -83,8 +81,6 @@ pub async fn grep( ignore_case, node_limit, level_limit, - engine, - switch_to_remote_threshold, remote_return_limit, ) .await?; diff --git a/crates/ov_cli/src/handlers.rs b/crates/ov_cli/src/handlers.rs index afaf254c60..0609991e30 100644 --- a/crates/ov_cli/src/handlers.rs +++ b/crates/ov_cli/src/handlers.rs @@ -1177,8 +1177,6 @@ pub async fn handle_grep( ignore_case: bool, node_limit: i32, level_limit: i32, - engine: Option, - switch_to_remote_threshold: Option, remote_return_limit: Option, ctx: CliContext, ) -> Result<()> { @@ -1218,8 +1216,6 @@ pub async fn handle_grep( ignore_case, node_limit, level_limit, - engine, - switch_to_remote_threshold, remote_return_limit, ctx.output_format, ctx.compact, diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs index b6f44b1337..7a9ded1a8f 100644 --- a/crates/ov_cli/src/main.rs +++ b/crates/ov_cli/src/main.rs @@ -502,14 +502,8 @@ enum Commands { /// Maximum depth level to traverse (default: 10) #[arg(short = 'L', long = "level-limit", default_value = "10")] level_limit: i32, - /// Search engine mode: "auto" (default) or "fs" - #[arg(long = "engine", default_value = "auto")] - engine: Option, - /// L2 record count threshold to switch to vikingdb; 0 means always use vikingdb - #[arg(long = "switch-to-remote-threshold", default_value = "1000")] - switch_to_remote_threshold: Option, - /// Maximum files recalled by vikingdb bm25 (1-100000) - #[arg(long = "remote-return-limit", default_value = "100")] + /// Maximum files recalled by vikingdb bm25; 0 means auto-adapt (0-100000) + #[arg(long = "remote-return-limit", default_value = "0")] remote_return_limit: Option, }, /// [Data] Run file glob pattern search @@ -1498,8 +1492,6 @@ async fn main() { ignore_case, node_limit, level_limit, - engine, - switch_to_remote_threshold, remote_return_limit, } => { handlers::handle_grep( @@ -1509,8 +1501,6 @@ async fn main() { ignore_case, node_limit, level_limit, - 
engine, - switch_to_remote_threshold, remote_return_limit, ctx, ) diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index cdb50b7389..7b659d929e 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -844,6 +844,26 @@ Retrieval ranking configuration for final search scores. Keep `hotness_alpha` at `0.0` when you need scores to reflect pure vector similarity. Set it above `0.0` only when frequently accessed or recently updated contexts should receive a ranking boost. +### grep + +Grep engine configuration for content pattern search. These settings are server-side only and cannot be overridden per-request. + +```json +{ + "grep": { + "engine": "auto", + "switch_to_remote_threshold": 1000 + } +} +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `engine` | str | Search engine mode: `"auto"` uses VikingDB BM25 recall when available and falls back to local filesystem search; `"fs"` forces local filesystem search only. | `"auto"` | +| `switch_to_remote_threshold` | int | L2 record count threshold to switch to VikingDB BM25 recall. When the number of L2 files under the search scope exceeds this threshold, VikingDB BM25 is used for phase-1 recall; otherwise local filesystem search is used. Set to `0` to always use VikingDB BM25. Must be ≥ 0. | `1000` | + +The per-request parameter `remote_return_limit` (0–100000, default `0` = auto-adapt to 100000) controls the maximum number of files recalled by VikingDB BM25 in a single grep call. + ### storage Storage configuration for context data, including file storage (RAGFS) and vector database storage (VectorDB). @@ -1109,7 +1129,7 @@ openviking-server --config /path/to/ov.conf ### ov.conf -The config sections documented above (embedding, vlm, rerank, storage) all belong to `ov.conf`. SDK embedded mode and server share this file. 
+The config sections documented above (embedding, vlm, rerank, retrieval, grep, storage) all belong to `ov.conf`. SDK embedded mode and server share this file. For memory-related settings, add a `memory` section in `ov.conf`: diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md index 33cb3b65ce..c6a92b91a6 100644 --- a/docs/zh/guides/01-configuration.md +++ b/docs/zh/guides/01-configuration.md @@ -814,6 +814,26 @@ AST 提取支持:Python、JavaScript/TypeScript、Rust、Go、Java、C/C++。 如果需要分数严格反映向量相似度,保持 `hotness_alpha` 为 `0.0`。只有当希望高频访问或最近更新的上下文获得排序提升时,才将它设置为大于 `0.0`。 +### grep + +Grep 引擎配置,用于内容模式搜索。这些设置为服务端配置,不支持请求级别覆盖。 + +```json +{ + "grep": { + "engine": "auto", + "switch_to_remote_threshold": 1000 + } +} +``` + +| 参数 | 类型 | 说明 | 默认值 | +|------|------|------|--------| +| `engine` | str | 搜索引擎模式:`"auto"` 在可用时使用 VikingDB BM25 召回,不可用时回退到本地文件系统搜索;`"fs"` 强制仅使用本地文件系统搜索。 | `"auto"` | +| `switch_to_remote_threshold` | int | 切换到 VikingDB BM25 召回的 L2 记录数阈值。当搜索范围内的 L2 文件数超过此阈值时,使用 VikingDB BM25 进行第一阶段召回;否则使用本地文件系统搜索。设为 `0` 表示始终使用 VikingDB BM25。必须 ≥ 0。 | `1000` | + +请求级别参数 `remote_return_limit`(0–100000,默认 `0` = 自适应调整为 100000)控制单次 grep 调用中 VikingDB BM25 召回的最大文件数。 + ### storage 用于存储上下文数据 ,包括文件存储(RAGFS)和向量库存储(VectorDB)。 diff --git a/examples/ov.conf.example b/examples/ov.conf.example index 453af2a1c1..163bc7d12b 100644 --- a/examples/ov.conf.example +++ b/examples/ov.conf.example @@ -149,6 +149,7 @@ "threshold": 0.1, }, "retrieval": {"hotness_alpha": 0.0, "score_propagation_alpha": 1.0}, + "grep": {"engine": "auto", "switch_to_remote_threshold": 1000}, "auto_generate_l0": true, "auto_generate_l1": true, "default_search_mode": "thinking", diff --git a/openviking/async_client.py b/openviking/async_client.py index 81df165f26..8d124a2c3e 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -11,8 +11,6 @@ import threading from typing import Any, Dict, List, Optional, Union -from openviking.storage.viking_fs import GrepEngine - from 
openviking.client import LocalClient, Session from openviking.service.debug_service import SystemStatus from openviking.telemetry import TelemetryRequest @@ -477,9 +475,8 @@ async def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, - engine: GrepEngine = "auto", - switch_to_remote_threshold: int = 1000, - remote_return_limit: int = 100, + level_limit: int = 5, + remote_return_limit: int = 0, ) -> Dict: """Content search""" await self._ensure_initialized() @@ -489,8 +486,7 @@ async def grep( case_insensitive=case_insensitive, node_limit=node_limit, exclude_uri=exclude_uri, - engine=engine, - switch_to_remote_threshold=switch_to_remote_threshold, + level_limit=level_limit, remote_return_limit=remote_return_limit, ) diff --git a/openviking/client/local.py b/openviking/client/local.py index 687f74d74f..191a7fc48a 100644 --- a/openviking/client/local.py +++ b/openviking/client/local.py @@ -7,8 +7,6 @@ from typing import Any, Dict, List, Optional, Union -from openviking.storage.viking_fs import GrepEngine - from openviking.server.identity import RequestContext, Role from openviking.service import OpenVikingService from openviking.telemetry import TelemetryRequest @@ -373,9 +371,7 @@ async def grep( node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, level_limit: int = 5, - engine: GrepEngine = "auto", - switch_to_remote_threshold: int = 1000, - remote_return_limit: int = 100, + remote_return_limit: int = 0, ) -> Dict[str, Any]: """Content search with pattern.""" return await self._service.fs.grep( @@ -386,8 +382,6 @@ async def grep( node_limit=node_limit, exclude_uri=exclude_uri, level_limit=level_limit, - engine=engine, - switch_to_remote_threshold=switch_to_remote_threshold, remote_return_limit=remote_return_limit, ) diff --git a/openviking/server/routers/search.py b/openviking/server/routers/search.py index 6e2783ba69..c61d81e853 100644 --- a/openviking/server/routers/search.py +++ 
b/openviking/server/routers/search.py @@ -16,7 +16,6 @@ from openviking.server.identity import RequestContext from openviking.server.models import Response from openviking.server.telemetry import run_operation -from openviking.storage.viking_fs import GrepEngine from openviking.telemetry import TelemetryRequest from openviking.utils.search_filters import _resolve_levels, merge_time_filter from openviking_cli.exceptions import InvalidArgumentError, NotFoundError @@ -112,14 +111,11 @@ class GrepRequest(BaseModel): case_insensitive: bool = False node_limit: Optional[int] = None level_limit: int = 5 - engine: GrepEngine = "auto" - switch_to_remote_threshold: int = Field( - default=1000, - ge=0, - description="L2 record count threshold to switch to vikingdb; 0 means always use vikingdb", - ) remote_return_limit: int = Field( - default=100, ge=1, le=100000, description="Maximum files recalled by vikingdb bm25" + default=0, + ge=0, + le=100000, + description="Maximum files recalled by vikingdb bm25; 0 means auto-adapt", ) @@ -238,8 +234,6 @@ async def grep( case_insensitive=request.case_insensitive, node_limit=request.node_limit, level_limit=request.level_limit, - engine=request.engine, - switch_to_remote_threshold=request.switch_to_remote_threshold, remote_return_limit=request.remote_return_limit, ) except AGFSNotFoundError: diff --git a/openviking/service/core.py b/openviking/service/core.py index 7edd3ecd7c..e329e3e644 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -285,6 +285,7 @@ async def initialize(self) -> None: rerank_config=config.rerank, vector_store=self._vikingdb_manager, retrieval_config=config.retrieval, + grep_config=config.grep, enable_recorder=enable_recorder, encryptor=self._encryptor, ) diff --git a/openviking/service/fs_service.py b/openviking/service/fs_service.py index 64577e8548..b42de2b3b1 100644 --- a/openviking/service/fs_service.py +++ b/openviking/service/fs_service.py @@ -17,7 +17,7 @@ ) from 
openviking.server.identity import RequestContext from openviking.storage.content_write import ContentWriteCoordinator -from openviking.storage.viking_fs import GrepEngine, VikingFS +from openviking.storage.viking_fs import VikingFS from openviking.utils.embedding_utils import vectorize_directory_meta from openviking_cli.exceptions import NotInitializedError from openviking_cli.utils import VikingURI, get_logger @@ -243,9 +243,7 @@ async def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, level_limit: int = 5, - engine: GrepEngine = "auto", - switch_to_remote_threshold: int = 1000, - remote_return_limit: int = 100, + remote_return_limit: int = 0, ) -> Dict: """Content search.""" viking_fs = self._ensure_initialized() @@ -258,8 +256,6 @@ async def grep( case_insensitive=case_insensitive, node_limit=node_limit, level_limit=level_limit, - engine=engine, - switch_to_remote_threshold=switch_to_remote_threshold, remote_return_limit=remote_return_limit, ctx=ctx, ) diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index f52389a688..54a6e59452 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -22,10 +22,7 @@ from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import PurePath -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union - -# Grep engine mode type alias — import this instead of repeating Literal["auto", "fs"] -GrepEngine = Literal["auto", "fs"] +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from openviking.core.namespace import ( NamespaceShapeError, @@ -55,13 +52,14 @@ PermissionDeniedError, ) from openviking_cli.session.user_id import UserIdentifier +from openviking_cli.utils.config.grep_config import GrepEngine from openviking_cli.utils.logger import get_logger from openviking_cli.utils.uri import VikingURI if TYPE_CHECKING: from openviking.storage.transaction.lock_handle import LockHandle 
from openviking.storage.viking_vector_index_backend import VikingVectorIndexBackend - from openviking_cli.utils.config import RerankConfig, RetrievalConfig + from openviking_cli.utils.config import GrepConfig, RerankConfig, RetrievalConfig logger = get_logger(__name__) @@ -125,6 +123,7 @@ def init_viking_fs( rerank_config: Optional["RerankConfig"] = None, vector_store: Optional["VikingVectorIndexBackend"] = None, retrieval_config: Optional["RetrievalConfig"] = None, + grep_config: Optional["GrepConfig"] = None, timeout: int = 10, enable_recorder: bool = False, encryptor: Optional[Any] = None, @@ -133,10 +132,10 @@ def init_viking_fs( Args: agfs: Pre-initialized AGFS client (HTTP or Binding) - agfs_config: AGFS configuration object for backend settings query_embedder: Embedder instance rerank_config: Rerank configuration retrieval_config: Retrieval ranking configuration + grep_config: Grep engine configuration vector_store: Vector store instance enable_recorder: Whether to enable IO recording encryptor: FileEncryptor instance for encryption/decryption @@ -149,6 +148,7 @@ def init_viking_fs( rerank_config=rerank_config, vector_store=vector_store, retrieval_config=retrieval_config, + grep_config=grep_config, encryptor=encryptor, ) @@ -221,6 +221,7 @@ def __init__( rerank_config: Optional["RerankConfig"] = None, vector_store: Optional["VikingVectorIndexBackend"] = None, retrieval_config: Optional["RetrievalConfig"] = None, + grep_config: Optional["GrepConfig"] = None, timeout: int = 10, encryptor: Optional[Any] = None, ): @@ -230,6 +231,7 @@ def __init__( self.rerank_config = rerank_config self.vector_store = vector_store self.retrieval_config = retrieval_config + self.grep_config = grep_config self._encryptor = encryptor self._count_cache: Dict[str, tuple] = {} # cache_key → (count, timestamp) self._count_cache_max_size = 1024 @@ -692,9 +694,7 @@ async def grep( node_limit: Optional[int] = None, level_limit: int = 5, ctx: Optional[RequestContext] = None, - engine: 
GrepEngine = "auto", - switch_to_remote_threshold: int = 1000, - remote_return_limit: int = 100, + remote_return_limit: int = 0, ) -> Dict: """Content search by pattern or keywords. @@ -711,11 +711,9 @@ async def grep( node_limit: Maximum number of results to return level_limit: Maximum depth level to traverse (default: 5) ctx: Request context - engine: Search engine mode: "auto" (default) or "fs" - switch_to_remote_threshold: L2 record count threshold to switch to - vikingdb; 0 means always use vikingdb (default: 1000) - remote_return_limit: Maximum files recalled by vikingdb bm25 - (default: 100, max: 100000) + remote_return_limit: Maximum files recalled by vikingdb bm25. + 0 means auto-adapt: use maximum limit (100000) to avoid + truncating bm25 recall results (default: 0, max: 100000) Returns: Dict with matches, count, match_count, files_scanned @@ -723,8 +721,17 @@ async def grep( self._ensure_access(uri, ctx) await self.stat(uri, ctx=ctx) - # Clamp remote_return_limit to valid range - remote_return_limit = max(1, min(remote_return_limit, 100000)) + # Clamp remote_return_limit to valid range (0 = auto, 1-100000 = explicit) + if remote_return_limit < 0: + remote_return_limit = 0 + elif remote_return_limit > 0: + remote_return_limit = max(1, min(remote_return_limit, 100000)) + + # Read engine and threshold from grep_config (ov.conf) + engine = self.grep_config.engine if self.grep_config else "auto" + switch_to_remote_threshold = ( + self.grep_config.switch_to_remote_threshold if self.grep_config else 1000 + ) resolved_engine = await self._resolve_grep_engine( engine, uri, ctx, switch_to_remote_threshold @@ -886,17 +893,25 @@ async def _grep_vikingdb_then_fs( """VikingDB bm25 recall + local fs precise matching.""" vector_store = self._get_vector_store() + # Split regex alternation (e.g. "error|warning|fail") into individual keywords + # for bm25 search. Limit to 10 keywords per VikingDB API constraint. 
+ keywords = [kw.strip() for kw in pattern.split("|") if kw.strip()][:10] + filter_expr = And( + [ + PathScope("uri", uri), + Eq("level", 2), + ] + ) + + # Auto-adapt remote_return_limit: when 0 (default), use the maximum + # limit to recall all bm25-matched candidates, ensuring no results are + # truncated by an arbitrary cap. The real cost is in phase 2 (local + # regex on recalled files), not in bm25 recall itself. + if remote_return_limit == 0: + remote_return_limit = 100000 + # Step 1: vikingdb recall candidate files try: - # Split regex alternation (e.g. "error|warning|fail") into individual keywords - # for bm25 search. Limit to 10 keywords per VikingDB API constraint. - keywords = [kw.strip() for kw in pattern.split("|") if kw.strip()][:10] - filter_expr = And( - [ - PathScope("uri", uri), - Eq("level", 2), - ] - ) result = await vector_store.search_by_keywords( keywords=keywords, mode="bm25", diff --git a/openviking/sync_client.py b/openviking/sync_client.py index 9140da869d..00182872d1 100644 --- a/openviking/sync_client.py +++ b/openviking/sync_client.py @@ -8,8 +8,6 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -from openviking.storage.viking_fs import GrepEngine - if TYPE_CHECKING: from openviking.session import Session @@ -368,9 +366,8 @@ def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, - engine: GrepEngine = "auto", - switch_to_remote_threshold: int = 1000, - remote_return_limit: int = 100, + level_limit: int = 5, + remote_return_limit: int = 0, ) -> Dict: """Content search""" return run_async( @@ -380,8 +377,7 @@ def grep( case_insensitive, node_limit, exclude_uri, - engine, - switch_to_remote_threshold, + level_limit, remote_return_limit, ) ) diff --git a/openviking_cli/client/base.py b/openviking_cli/client/base.py index 7068a9eb6c..cdd7a5a8b3 100644 --- a/openviking_cli/client/base.py +++ b/openviking_cli/client/base.py @@ -8,8 +8,6 @@ from abc import ABC, 
abstractmethod from typing import Any, Dict, List, Optional, Union -from openviking.storage.viking_fs import GrepEngine - from openviking.telemetry import TelemetryRequest @@ -196,9 +194,8 @@ async def grep( case_insensitive: bool = False, exclude_uri: Optional[str] = None, node_limit: Optional[int] = None, - engine: GrepEngine = "auto", - switch_to_remote_threshold: int = 1000, - remote_return_limit: int = 100, + level_limit: int = 5, + remote_return_limit: int = 0, ) -> Dict[str, Any]: """Content search with pattern.""" ... diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py index cb729a89a1..b42822c989 100644 --- a/openviking_cli/client/http.py +++ b/openviking_cli/client/http.py @@ -11,8 +11,6 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union -from openviking.storage.viking_fs import GrepEngine - import httpx from openviking.telemetry import TelemetryRequest, normalize_telemetry_request @@ -742,9 +740,8 @@ async def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, - engine: GrepEngine = "auto", - switch_to_remote_threshold: int = 1000, - remote_return_limit: int = 100, + level_limit: int = 5, + remote_return_limit: int = 0, ) -> Dict[str, Any]: """Content search with pattern.""" uri = VikingURI.normalize(uri) @@ -752,8 +749,7 @@ async def grep( "uri": uri, "pattern": pattern, "case_insensitive": case_insensitive, - "engine": engine, - "switch_to_remote_threshold": switch_to_remote_threshold, + "level_limit": level_limit, "remote_return_limit": remote_return_limit, } if node_limit is not None: diff --git a/openviking_cli/client/sync_http.py b/openviking_cli/client/sync_http.py index 52f82edda4..886318752d 100644 --- a/openviking_cli/client/sync_http.py +++ b/openviking_cli/client/sync_http.py @@ -7,7 +7,6 @@ from typing import Any, Dict, List, Optional, Union -from openviking.storage.viking_fs import GrepEngine from openviking.telemetry import 
TelemetryRequest from openviking_cli.client.http import AsyncHTTPClient from openviking_cli.utils import run_async @@ -314,9 +313,8 @@ def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, - engine: GrepEngine = "auto", - switch_to_remote_threshold: int = 1000, - remote_return_limit: int = 100, + level_limit: int = 5, + remote_return_limit: int = 0, ) -> Dict: """Content search with pattern.""" return run_async( @@ -326,8 +324,7 @@ def grep( case_insensitive, node_limit, exclude_uri, - engine, - switch_to_remote_threshold, + level_limit, remote_return_limit, ) ) diff --git a/openviking_cli/utils/config/__init__.py b/openviking_cli/utils/config/__init__.py index 64a5a6489d..9e3385d921 100644 --- a/openviking_cli/utils/config/__init__.py +++ b/openviking_cli/utils/config/__init__.py @@ -51,6 +51,7 @@ SYSTEM_CONFIG_DIR, ) from .embedding_config import EmbeddingConfig +from .grep_config import GrepConfig, GrepEngine from .log_config import LogConfig from .open_viking_config import ( OpenVikingConfig, @@ -131,6 +132,8 @@ "OpenVikingConfig", "OpenVikingConfigSingleton", "OVCLIConfig", + "GrepConfig", + "GrepEngine", "RerankConfig", "RetrievalConfig", "StorageConfig", diff --git a/openviking_cli/utils/config/grep_config.py b/openviking_cli/utils/config/grep_config.py new file mode 100644 index 0000000000..9e2564fe90 --- /dev/null +++ b/openviking_cli/utils/config/grep_config.py @@ -0,0 +1,31 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: AGPL-3.0 + +from typing import Literal + +from pydantic import BaseModel, Field + +# Grep engine mode type alias — import this instead of repeating Literal["auto", "fs"] +GrepEngine = Literal["auto", "fs"] + + +class GrepConfig(BaseModel): + """Configuration for grep engine behavior.""" + + engine: GrepEngine = Field( + default="auto", + description=( + "Search engine mode: 'auto' uses vikingdb bm25 recall when available, " + "'fs' forces local filesystem search." + ), + ) + + switch_to_remote_threshold: int = Field( + default=1000, + ge=0, + description=( + "L2 record count threshold to switch to vikingdb; 0 means always use vikingdb." + ), + ) + + model_config = {"extra": "forbid"} diff --git a/openviking_cli/utils/config/open_viking_config.py b/openviking_cli/utils/config/open_viking_config.py index 05d7958f97..cad7c415d4 100644 --- a/openviking_cli/utils/config/open_viking_config.py +++ b/openviking_cli/utils/config/open_viking_config.py @@ -21,6 +21,7 @@ ) from .embedding_config import EmbeddingConfig from .encryption_config import EncryptionConfig +from .grep_config import GrepConfig from .log_config import LogConfig from .memory_config import MemoryConfig from .oauth_config import OAuthConfig @@ -84,6 +85,11 @@ class OpenVikingConfig(BaseModel): description="Retrieval ranking configuration", ) + grep: GrepConfig = Field( + default_factory=GrepConfig, + description="Grep engine configuration", + ) + # Encryption configuration encryption: EncryptionConfig = Field( default_factory=EncryptionConfig, description="Encryption configuration" From 3ea7b3f8a479e2c232583dfaffa7bea188230fbc Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Tue, 26 May 2026 17:30:55 +0800 Subject: [PATCH 10/31] optimize: auto adapt remote_return_limit by agg API; rm unnecessary params in keywords search --- .../vikingdb_bm25/step5_retrieval_quality.py | 273 ++++++++++++++++++ .../storage/vectordb/collection/collection.py | 8 +- 
.../vectordb/collection/http_collection.py | 7 +- .../vectordb/collection/local_collection.py | 4 - .../collection/vikingdb_collection.py | 4 - .../volcengine_api_key_collection.py | 4 - .../collection/volcengine_collection.py | 4 - .../storage/vectordb/service/app_models.py | 2 - openviking/storage/vectordb_adapters/base.py | 4 - openviking/storage/viking_fs.py | 8 +- .../storage/viking_vector_index_backend.py | 8 - tests/storage/mock_backend.py | 2 - 12 files changed, 278 insertions(+), 50 deletions(-) create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py b/benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py new file mode 100644 index 0000000000..fbd4e3bef2 --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 +"""Step 5: Retrieval quality evaluation — compare auto (bm25) vs fs grep. + +Prerequisites: + 1. Run step1_generate.py to create test data + 2. Run step2_quick_add_resource.py to upload files + 3. Run step3_build_index.py to build index (embedding + content) + 4. Ensure ov.conf has: + "grep": {"engine": "auto", "switch_to_remote_threshold": 0} + (switch_to_remote_threshold = 0 forces VikingDB BM25 for all queries) + 5. Restart the server after changing ov.conf + +Approach: + - Ground truth: scan local benchmark files with Python regex (equivalent to fs engine) + - Test: call `ov grep` CLI with --output json to get structured results + - Compare: compute Recall, Precision, F1 per query pattern + +NOTE: `remote_return_limit` defaults to 0 (auto-adapt to 100000), so bm25 recall +is not truncated. No need to test different limit values. 
+ +Usage: + python3 step5_retrieval_quality.py [--uri URI] [--case-insensitive] [--output FILE] +""" + +import argparse +import json +import os +import re +import shlex +import subprocess +import time + +BASE_URI = "viking://resources/benchmark" +OV_CMD = ["ov", "--account", "default", "--user", "default"] +DATA_DIR = os.path.expanduser("~/.openviking/data") +BENCHMARK_DIR = os.path.join(DATA_DIR, "benchmark") + +# Test patterns covering different keyword types +# (label, pattern) +TEST_PATTERNS = [ + # CamelCase + ("CamelCase: VikingDB", "VikingDB"), + # PascalCase + ("PascalCase: FullText", "FullText"), + # lowercase + ("lowercase: bm25", "bm25"), + # snake_case + ("snake_case: search_by_keywords", "search_by_keywords"), + # Multi-keyword regex + ("multi: VikingDB|FullText", "VikingDB|FullText"), + ("multi: VikingDB|FullText|bm25", "VikingDB|FullText|bm25"), + # No-match + ("no-match: zzz_nonexistent", "zzz_nonexistent"), +] + + +def run_ov_grep(uri: str, pattern: str, case_insensitive: bool = False) -> tuple[set[str], float]: + """Run `ov grep --output json` and extract matched URIs.""" + cmd = OV_CMD + [ + "--output", + "json", + "grep", + "--uri", + uri, + "-n", + "100000", + pattern, + ] + if case_insensitive: + cmd.insert(cmd.index("grep") + 1, "-i") + + cmd_str = shlex.join(cmd) + print(f" $ {cmd_str}") + + t0 = time.monotonic() + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + elapsed = time.monotonic() - t0 + + if result.returncode != 0: + stderr = result.stderr.strip()[:200] + raise RuntimeError(f"ov grep failed (exit={result.returncode}): {stderr}") + + # Parse JSON response: {"status": "ok", "result": {"matches": [...], ...}} + try: + resp = json.loads(result.stdout) + except json.JSONDecodeError as e: + raise RuntimeError(f"Failed to parse ov grep output: {e}") from e + + grep_result = resp.get("result", resp) + uris = set() + for match in grep_result.get("matches", []): + uri = match.get("uri", "") + if uri: + 
uris.add(uri.rstrip("/")) + return uris, elapsed + + +def local_path_to_viking_uri(filepath: str) -> str: + """Convert a local benchmark file path to a viking URI.""" + rel = os.path.relpath(filepath, DATA_DIR) + return "viking://resources/" + rel.replace(os.sep, "/").rstrip("/") + + +def compute_ground_truth(pattern: str, case_insensitive: bool = False) -> tuple[set[str], float]: + """Scan local benchmark files with Python regex to get ground truth.""" + flags = re.IGNORECASE if case_insensitive else 0 + compiled = re.compile(pattern, flags) + truth_uris = set() + t0 = time.monotonic() + for root, dirs, files in os.walk(BENCHMARK_DIR): + dirs.sort() + for fname in sorted(files): + if not fname.endswith(".md"): + continue + filepath = os.path.join(root, fname) + try: + with open(filepath) as f: + content = f.read() + if compiled.search(content): + truth_uris.add(local_path_to_viking_uri(filepath)) + except Exception: + pass + elapsed = time.monotonic() - t0 + return truth_uris, elapsed + + +def compute_metrics(truth: set[str], predicted: set[str]) -> dict: + """Compute recall, precision, F1.""" + if not truth and not predicted: + return {"recall": 1.0, "precision": 1.0, "f1": 1.0, "tp": 0, "fp": 0, "fn": 0} + if not truth: + return {"recall": 0.0, "precision": 0.0, "f1": 0.0, "tp": 0, "fp": len(predicted), "fn": 0} + + tp = len(truth & predicted) + fp = len(predicted - truth) + fn = len(truth - predicted) + recall = tp / len(truth) + precision = tp / len(predicted) if predicted else 0.0 + f1 = 2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0.0 + return {"recall": recall, "precision": precision, "f1": f1, "tp": tp, "fp": fp, "fn": fn} + + +def main(): + parser = argparse.ArgumentParser(description="Step 5: Retrieval quality evaluation") + parser.add_argument("--uri", default=BASE_URI, help=f"Base URI to search (default: {BASE_URI})") + parser.add_argument("--case-insensitive", action="store_true", help="Case-insensitive matching") + 
parser.add_argument( + "--output", default=None, help="Output JSON file path (default: print to stdout only)" + ) + args = parser.parse_args() + + if not os.path.isdir(BENCHMARK_DIR): + print(f"Error: Benchmark data not found at {BENCHMARK_DIR}") + print("Run step1_generate.py first.") + return + + print("=" * 100) + print("Retrieval Quality Evaluation: auto (bm25+fs) vs local fs (ground truth)") + print("=" * 100) + print(f"URI: {args.uri}") + print(f"Case insensitive: {args.case_insensitive}") + print(f"Data dir: {BENCHMARK_DIR}") + print() + print("Ensure ov.conf has:") + print(' "grep": {"engine": "auto", "switch_to_remote_threshold": 0}') + print("And the server has been restarted.") + print() + + results = [] + + for label, pattern in TEST_PATTERNS: + print(f"--- {label} (pattern: {pattern}) ---") + + # Ground truth: scan local files + truth_uris, fs_elapsed = compute_ground_truth(pattern, args.case_insensitive) + print(f" Ground truth (local fs): {len(truth_uris)} matches ({fs_elapsed:.2f}s)") + + # Auto grep (via ov CLI with --output json) + try: + auto_uris, auto_elapsed = run_ov_grep(args.uri, pattern, args.case_insensitive) + except Exception as e: + print(f" Auto grep FAILED: {e}") + results.append( + { + "label": label, + "pattern": pattern, + "error": str(e), + "truth_count": len(truth_uris), + } + ) + continue + print(f" Auto grep (bm25+fs): {len(auto_uris)} matches ({auto_elapsed:.2f}s)") + + # Compute metrics + metrics = compute_metrics(truth_uris, auto_uris) + print( + f" Recall: {metrics['recall']:.4f} " + f"Precision: {metrics['precision']:.4f} " + f"F1: {metrics['f1']:.4f}" + ) + if metrics["fn"] > 0: + print(f" Missed (FN): {metrics['fn']}") + if metrics["fp"] > 0: + print(f" Extra (FP): {metrics['fp']}") + + # Show sample missed URIs for debugging + if metrics["fn"] > 0: + missed = sorted(truth_uris - auto_uris)[:5] + print(" Sample missed URIs:") + for u in missed: + print(f" {u}") + + results.append( + { + "label": label, + "pattern": 
pattern, + "truth_count": len(truth_uris), + "auto_count": len(auto_uris), + "fs_elapsed_s": round(fs_elapsed, 3), + "auto_elapsed_s": round(auto_elapsed, 3), + **metrics, + } + ) + + # Summary table + print() + print("=" * 110) + print( + f"{'Label':<40} {'Truth':>6} {'Auto':>6} {'Recall':>8} {'Prec':>8} {'F1':>8} {'Missed':>8}" + ) + print("-" * 110) + for r in results: + if "error" in r: + print( + f"{r['label']:<40} {r['truth_count']:>6} {'ERR':>6} " + f"{'---':>8} {'---':>8} {'---':>8} {'---':>8}" + ) + else: + print( + f"{r['label']:<40} {r['truth_count']:>6} {r['auto_count']:>6} " + f"{r['recall']:>8.4f} {r['precision']:>8.4f} {r['f1']:>8.4f} {r['fn']:>8}" + ) + print() + + # Verdict + has_recall_loss = any(r.get("fn", 0) > 0 for r in results) + has_precision_loss = any(r.get("fp", 0) > 0 for r in results) + if not has_recall_loss and not has_precision_loss: + print( + "VERDICT: All queries achieved perfect recall and precision. bm25 recall is complete." + ) + else: + if has_recall_loss: + print("VERDICT: Recall loss detected — some files not recalled by bm25.") + print( + " Possible causes: content field truncation, tokenizer mismatch, or incomplete reindex." + ) + if has_precision_loss: + print("VERDICT: Precision loss detected — unexpected matches in auto results.") + print( + " This should not happen (phase 2 regex guarantees precision). Investigate URI format." 
+ ) + print() + + # Save results + if args.output: + with open(args.output, "w") as f: + json.dump(results, f, indent=2) + print(f"Results saved to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/openviking/storage/vectordb/collection/collection.py b/openviking/storage/vectordb/collection/collection.py index ed25ce04b7..4d50cdafef 100644 --- a/openviking/storage/vectordb/collection/collection.py +++ b/openviking/storage/vectordb/collection/collection.py @@ -73,8 +73,6 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, - mode: Optional[str] = None, - fields: Optional[List[str]] = None, ) -> SearchResult: raise NotImplementedError @@ -350,8 +348,6 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, - mode: Optional[str] = None, - fields: Optional[List[str]] = None, ): """Search by keywords or query string using vectorization. @@ -364,8 +360,6 @@ def search_by_keywords( filters (Optional[Dict[str, Any]]): Query filters to narrow down results. Defaults to None. output_fields (Optional[List[str]]): List of field names to include in results. If None, returns all fields. Defaults to None. - mode (Optional[str]): Search mode, e.g. "bm25" for full-text search. Defaults to None. - fields (Optional[List[str]]): Text fields to search in (for bm25 mode). Defaults to None. Returns: SearchResult: Search results containing matching documents with scores and field values. 
@@ -377,7 +371,7 @@ def search_by_keywords( if self.__collection is None: raise RuntimeError("Collection is closed") return self.__collection.search_by_keywords( - index_name, keywords, query, limit, offset, filters, output_fields, mode, fields + index_name, keywords, query, limit, offset, filters, output_fields ) def search_by_id( diff --git a/openviking/storage/vectordb/collection/http_collection.py b/openviking/storage/vectordb/collection/http_collection.py index 8b831ed0a9..63452da5c7 100644 --- a/openviking/storage/vectordb/collection/http_collection.py +++ b/openviking/storage/vectordb/collection/http_collection.py @@ -6,6 +6,7 @@ import requests +import openviking from openviking.storage.vectordb.collection.collection import Collection, ICollection from openviking.storage.vectordb.collection.result import ( AggregateResult, @@ -15,8 +16,6 @@ SearchResult, ) -import openviking - # Default request timeout (seconds) DEFAULT_TIMEOUT = 30 @@ -541,8 +540,6 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, - mode: Optional[str] = None, - fields: Optional[List[str]] = None, ) -> SearchResult: url = self.url_prefix + "api/vikingdb/data/search/keywords" payload = { @@ -555,8 +552,6 @@ def search_by_keywords( "output_fields": json.dumps(output_fields) if output_fields else None, "limit": limit, "offset": offset, - "mode": mode, - "fields": json.dumps(fields) if fields else None, } payload = {k: v for k, v in payload.items() if v is not None} response = requests.post( diff --git a/openviking/storage/vectordb/collection/local_collection.py b/openviking/storage/vectordb/collection/local_collection.py index 5285b83382..5d1797dea7 100644 --- a/openviking/storage/vectordb/collection/local_collection.py +++ b/openviking/storage/vectordb/collection/local_collection.py @@ -436,8 +436,6 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: 
Optional[List[str]] = None, - mode: Optional[str] = None, - fields: Optional[List[str]] = None, ) -> SearchResult: """Search by keywords by generating vectors and calling search_by_vector. @@ -449,8 +447,6 @@ def search_by_keywords( offset: Number of results to skip filters: Filter conditions output_fields: List of fields to return - mode: Search mode (ignored for local backend) - fields: Text fields to search (ignored for local backend) Returns: SearchResult: Search results diff --git a/openviking/storage/vectordb/collection/vikingdb_collection.py b/openviking/storage/vectordb/collection/vikingdb_collection.py index ddeaa1e7c5..c8a7d64736 100644 --- a/openviking/storage/vectordb/collection/vikingdb_collection.py +++ b/openviking/storage/vectordb/collection/vikingdb_collection.py @@ -322,8 +322,6 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, - mode: Optional[str] = None, - fields: Optional[List[str]] = None, ) -> SearchResult: path = "/api/vikingdb/data/search/keywords" data = { @@ -336,8 +334,6 @@ def search_by_keywords( "output_fields": output_fields, "limit": limit, "offset": offset, - "mode": mode, - "fields": fields, } data = {k: v for k, v in data.items() if v is not None} resp_data = self._data_post(path, data) diff --git a/openviking/storage/vectordb/collection/volcengine_api_key_collection.py b/openviking/storage/vectordb/collection/volcengine_api_key_collection.py index f432d683ee..29af76b2a4 100644 --- a/openviking/storage/vectordb/collection/volcengine_api_key_collection.py +++ b/openviking/storage/vectordb/collection/volcengine_api_key_collection.py @@ -361,8 +361,6 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, - mode: Optional[str] = None, - fields: Optional[List[str]] = None, ) -> SearchResult: path = "/api/vikingdb/data/search/keywords" data = { @@ -373,8 +371,6 @@ def 
search_by_keywords( "output_fields": output_fields, "limit": limit, "offset": offset, - "mode": mode, - "fields": fields, } data = {k: v for k, v in data.items() if v is not None} resp_data = self._data_post(path, data) diff --git a/openviking/storage/vectordb/collection/volcengine_collection.py b/openviking/storage/vectordb/collection/volcengine_collection.py index 22485ae0d3..45e6009451 100644 --- a/openviking/storage/vectordb/collection/volcengine_collection.py +++ b/openviking/storage/vectordb/collection/volcengine_collection.py @@ -560,8 +560,6 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, - mode: Optional[str] = None, - fields: Optional[List[str]] = None, ) -> SearchResult: path = "/api/vikingdb/data/search/keywords" data = { @@ -574,8 +572,6 @@ def search_by_keywords( "output_fields": output_fields, "limit": limit, "offset": offset, - "mode": mode, - "fields": fields, } data = {k: v for k, v in data.items() if v is not None} resp_data = self._data_post(path, data) diff --git a/openviking/storage/vectordb/service/app_models.py b/openviking/storage/vectordb/service/app_models.py index d15a617d35..81a622c609 100644 --- a/openviking/storage/vectordb/service/app_models.py +++ b/openviking/storage/vectordb/service/app_models.py @@ -167,8 +167,6 @@ class SearchByKeywordsRequest(BaseModel): output_fields: Optional[Any] = Field(None, description="Output fields") limit: Optional[int] = Field(10, description="Result limit") offset: Optional[int] = Field(0, description="Result offset") - mode: Optional[str] = Field(None, description="Search mode, e.g. 
bm25") - fields: Optional[Any] = Field(None, description="Text fields to search in") # ==================== Response Model ==================== diff --git a/openviking/storage/vectordb_adapters/base.py b/openviking/storage/vectordb_adapters/base.py index fc9806e005..f82275029a 100644 --- a/openviking/storage/vectordb_adapters/base.py +++ b/openviking/storage/vectordb_adapters/base.py @@ -531,8 +531,6 @@ def search_by_keywords( offset: int = 0, filter: Optional[Dict[str, Any] | FilterExpr] = None, output_fields: Optional[list[str]] = None, - mode: Optional[str] = None, - fields: Optional[list[str]] = None, ) -> list[Dict[str, Any]]: coll = self.get_collection() result = coll.search_by_keywords( @@ -543,8 +541,6 @@ def search_by_keywords( offset=offset, filters=self._compile_filter(filter), output_fields=output_fields, - mode=mode, - fields=fields, ) records: list[Dict[str, Any]] = [] for item in result.data: diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 54a6e59452..9952d90111 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -904,9 +904,9 @@ async def _grep_vikingdb_then_fs( ) # Auto-adapt remote_return_limit: when 0 (default), use the maximum - # limit to recall all bm25-matched candidates, ensuring no results are - # truncated by an arbitrary cap. The real cost is in phase 2 (local - # regex on recalled files), not in bm25 recall itself. + # limit (100000) so that search_by_keywords returns all matching + # documents without truncation. The real cost is in phase 2 (local + # regex on recalled files), not in the bm25 recall itself. 
if remote_return_limit == 0: remote_return_limit = 100000 @@ -914,8 +914,6 @@ async def _grep_vikingdb_then_fs( try: result = await vector_store.search_by_keywords( keywords=keywords, - mode="bm25", - fields=["content"], limit=remote_return_limit, filter=filter_expr, output_fields=["uri"], diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index 1513bac656..523dcb54d2 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -507,8 +507,6 @@ async def search_by_keywords( offset: int = 0, filter: Optional[Dict[str, Any] | FilterExpr] = None, output_fields: Optional[List[str]] = None, - mode: Optional[str] = None, - fields: Optional[List[str]] = None, ) -> List[Dict[str, Any]]: try: if self._bound_account_id: @@ -528,8 +526,6 @@ async def search_by_keywords( offset=offset, filter=filter, output_fields=output_fields, - mode=mode, - fields=fields, ) except Exception as e: logger.error("Error searching by keywords: %s", e) @@ -828,8 +824,6 @@ async def search_by_keywords( offset: int = 0, filter: Optional[Dict[str, Any] | FilterExpr] = None, output_fields: Optional[List[str]] = None, - mode: Optional[str] = None, - fields: Optional[List[str]] = None, *, ctx: Optional[RequestContext] = None, ) -> List[Dict[str, Any]]: @@ -844,8 +838,6 @@ async def search_by_keywords( offset=offset, filter=filter, output_fields=output_fields, - mode=mode, - fields=fields, ) async def clear(self, *, ctx: Optional[RequestContext] = None) -> bool: diff --git a/tests/storage/mock_backend.py b/tests/storage/mock_backend.py index 7dd67d919c..07e28d50fd 100644 --- a/tests/storage/mock_backend.py +++ b/tests/storage/mock_backend.py @@ -110,8 +110,6 @@ def search_by_keywords( offset: int = 0, filters: Optional[Dict[str, Any]] = None, output_fields: Optional[List[str]] = None, - mode: Optional[str] = None, - fields: Optional[List[str]] = None, ) -> SearchResult: raise 
NotImplementedError("MockCollection.search_by_keywords is not supported") From 1c65d648e85eca84056a73d48ede5595f34dc867 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Wed, 27 May 2026 14:39:53 +0800 Subject: [PATCH 11/31] fix: adjust benchmark scripts --- .../grep/vikingdb_bm25/step4_benchmark.py | 264 ++++++++++++------ .../vikingdb_bm25/step5_retrieval_quality.py | 47 ++-- 2 files changed, 202 insertions(+), 109 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py index 1f14e85e1e..b45f068a5e 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py @@ -1,139 +1,239 @@ #!/usr/bin/env python3 -"""Step 4: Benchmark grep performance: pure fs vs vikingdb bm25 + fs. +"""Step 4: Benchmark grep performance for the current engine config. Prerequisites: 1. Run step1_generate.py to create test data 2. Run step2_quick_add_resource.py to upload files (skip VLM+embedding) - 3. Run step3_build_index.py to build index (embedding) + 3. Run step3_build_index.py to build index (embedding + content) + 4. Set ov.conf grep config and restart the server -NOTE: `engine` and `switch_to_remote_threshold` are now server-side config -(ov.conf `[grep]` section). To benchmark different engines, update ov.conf -and restart the server before each run. The default config uses engine="auto" -with switch_to_remote_threshold=1000; set switch_to_remote_threshold=0 to -force VikingDB bm25 recall. +NOTE: `engine` and `switch_to_remote_threshold` are server-side config +(ov.conf `grep` section). To benchmark different engines, update ov.conf +and restart the server before each run. Usage: - python3 step4_benchmark.py [--runs N] [--warmup N] - -Outputs a comparison table of elapsed time and match count for each query. + # Run 1: benchmark with fs engine + # 1. Set ov.conf: "grep": {"engine": "fs"} + # 2. 
Restart server + python3 step4_benchmark.py --engine-label fs + + # Run 2: benchmark with auto engine (bm25) + # 1. Set ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0} + # 2. Restart server + python3 step4_benchmark.py --engine-label auto --compare step4_result_fs.json + +Results are saved to step4_result_{engine_label}.json. +When --compare is given, a side-by-side comparison table is printed. """ import argparse -import shlex +import json +import os import subprocess import time BASE_URI = "viking://resources/benchmark" OV_CMD = ["ov", "--account", "default", "--user", "default"] +RUNS = 3 +WARMUP = 1 -# Test cases: (label, pattern, extra_args) -# extra_args can override --uri; if present, the default --uri is omitted. +# Test cases: (label, pattern, uri) TEST_CASES = [ # --- Single keyword --- - ("single keyword (VikingDB)", "VikingDB", []), - ("single keyword (FullText)", "FullText", []), + ("single keyword (VikingDB)", "VikingDB", BASE_URI), + ("single keyword (FullText)", "FullText", BASE_URI), # --- Multi-keyword (regex alternation) --- - ("2 keywords (VikingDB|FullText)", "VikingDB|FullText", []), - ("3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", []), + ("2 keywords (VikingDB|FullText)", "VikingDB|FullText", BASE_URI), + ("3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", BASE_URI), # --- Rare keyword (lower hit count) --- - ("rare keyword (search_by_keywords)", "search_by_keywords", []), + ("rare keyword (search_by_keywords)", "search_by_keywords", BASE_URI), # --- Non-existent keyword (0 matches) --- - ("no-match 1 keyword (zzz_nonexistent)", "zzz_nonexistent", []), - ("no-match 2 keywords (zzz_a|zzz_b)", "zzz_a|zzz_b", []), - ("no-match 3 keywords (zzz_a|zzz_b|zzz_c)", "zzz_a|zzz_b|zzz_c", []), + ("no-match 1 keyword (zzz_nonexistent)", "zzz_nonexistent", BASE_URI), + ("no-match 2 keywords (zzz_a|zzz_b)", "zzz_a|zzz_b", BASE_URI), + ("no-match 3 keywords (zzz_a|zzz_b|zzz_c)", "zzz_a|zzz_b|zzz_c", 
BASE_URI), # --- Subdirectory scope (~8K files per level0 dir) --- - ("subdir level0_00, VikingDB (~8K files)", "VikingDB", ["--uri", f"{BASE_URI}/level0_00"]), - ( - "subdir level0_00, no-match (~8K files)", - "zzz_nonexistent", - ["--uri", f"{BASE_URI}/level0_00"], - ), + ("subdir level0_00, VikingDB (~8K files)", "VikingDB", f"{BASE_URI}/level0_00"), + ("subdir level0_00, no-match (~8K files)", "zzz_nonexistent", f"{BASE_URI}/level0_00"), ] -def _has_uri_arg(extra_args: list) -> bool: - """Check if extra_args contains --uri.""" - return "--uri" in extra_args - - -def run_grep(pattern: str, extra_args: list) -> tuple[float, int, str, str]: - """Run a single grep command, return (elapsed_seconds, match_count, stdout, stderr).""" - cmd = OV_CMD + ["grep"] - if not _has_uri_arg(extra_args): - cmd += ["--uri", BASE_URI] - cmd += extra_args + [pattern] - - cmd_str = shlex.join(cmd) - print(f" $ {cmd_str}") +def run_grep(pattern: str, uri: str) -> tuple[float, int]: + """Run a single grep command, return (elapsed_seconds, match_count).""" + cmd = OV_CMD + ["--output", "json", "grep", "--uri", uri, "-n", "100000", pattern] start = time.monotonic() - result = subprocess.run(cmd, capture_output=True, text=True) + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) elapsed = time.monotonic() - start + if result.returncode != 0: + raise RuntimeError( + f"ov grep failed (exit={result.returncode}): {result.stderr.strip()[:200]}" + ) + + # Find JSON line in stdout (skip echo_command output) + json_line = None + for line in result.stdout.strip().splitlines(): + line = line.strip() + if line.startswith("{"): + json_line = line + break + match_count = 0 - if result.stdout: - match_count = len([l for l in result.stdout.strip().splitlines() if l.strip()]) + if json_line: + try: + resp = json.loads(json_line) + grep_result = resp.get("result", resp) + match_count = len(grep_result.get("matches", [])) + except json.JSONDecodeError: + pass - return elapsed, 
match_count, result.stdout, result.stderr + return elapsed, match_count -def main(): - parser = argparse.ArgumentParser(description="Benchmark grep: fs vs bm25") - parser.add_argument( - "--runs", type=int, default=3, help="Number of runs per test case (default: 3)" - ) - parser.add_argument( - "--warmup", type=int, default=1, help="Warmup runs before measuring (default: 1)" - ) - args = parser.parse_args() +def benchmark_engine(engine_label: str) -> list[dict]: + """Run all test cases for the current engine config.""" + results = [] - print(f"{'Label':<50} {'Avg(ms)':<10} {'Min(ms)':<10} {'Max(ms)':<10}") - print("-" * 88) + for label, pattern, uri in TEST_CASES: + print(f" {label} ...", end=" ", flush=True) - for label, pattern, extra_args in TEST_CASES: - # Warmup runs - for _ in range(args.warmup): + # Warmup + for _ in range(WARMUP): try: - run_grep(pattern, extra_args) + run_grep(pattern, uri) except Exception: - break + pass # Measured runs times = [] - last_stdout = "" - last_stderr = "" + match_count = 0 failed = False - for _ in range(args.runs): + for _ in range(RUNS): try: - elapsed, matches, stdout, stderr = run_grep(pattern, extra_args) + elapsed, matches = run_grep(pattern, uri) times.append(elapsed) - last_stdout = stdout - last_stderr = stderr - except Exception: + match_count = matches + except Exception as e: failed = True + print(f"FAILED ({e})") break if failed: - print(f"{label:<50} FAILED") - elif not times: - print(f"{label:<50} NO DATA") + results.append({"label": label, "pattern": pattern, "uri": uri, "error": True}) else: avg_ms = sum(times) / len(times) * 1000 min_ms = min(times) * 1000 max_ms = max(times) * 1000 - print(f"{label:<50} {avg_ms:<10.1f} {min_ms:<10.1f} {max_ms:<10.1f}") - - # Print output from last run (compact) - if last_stdout.strip(): - for line in last_stdout.strip().splitlines()[:3]: - print(f" {line}") - if len(last_stdout.strip().splitlines()) > 3: - print(f" ... 
({len(last_stdout.strip().splitlines())} lines total)") - if last_stderr.strip(): - for line in last_stderr.strip().splitlines()[:2]: - print(f" [stderr] {line}") + print(f"avg={avg_ms:.1f}ms min={min_ms:.1f}ms matches={match_count}") + results.append( + { + "label": label, + "pattern": pattern, + "uri": uri, + "avg_ms": round(avg_ms, 1), + "min_ms": round(min_ms, 1), + "max_ms": round(max_ms, 1), + "matches": match_count, + } + ) + + return results + + +def print_comparison( + current_label: str, current: list[dict], compare_label: str, compare: list[dict] +): + """Print side-by-side comparison table.""" + # Build lookup by label + compare_by_label = {} + for r in compare: + if "error" not in r: + compare_by_label[r["label"]] = r print() + print("=" * 110) + print(f" Comparison: {compare_label} vs {current_label}") + print("=" * 110) + print( + f"{'Label':<50} {compare_label + '(ms)':>12} {current_label + '(ms)':>12} {'speedup':>10}" + ) + print("-" * 110) + + for r in current: + label = r["label"] + if "error" in r: + print(f"{label:<50} {'ERR':>12} {'ERR':>12} {'---':>10}") + continue + cur_ms = r["avg_ms"] + cmp = compare_by_label.get(label) + if not cmp: + print(f"{label:<50} {'N/A':>12} {cur_ms:>12.1f} {'---':>10}") + continue + cmp_ms = cmp["avg_ms"] + if cur_ms > 0: + speedup = cmp_ms / cur_ms + speedup_str = f"{speedup:.1f}x" + else: + speedup_str = "inf" + print(f"{label:<50} {cmp_ms:>12.1f} {cur_ms:>12.1f} {speedup_str:>10}") + + print() + + +def main(): + parser = argparse.ArgumentParser(description="Benchmark grep performance") + parser.add_argument( + "--engine-label", + required=True, + help="Label for this engine config (e.g. fs, auto). 
Used in output filename.", + ) + parser.add_argument( + "--compare", + default=None, + help="Path to a previous step4_result_*.json file for side-by-side comparison", + ) + args = parser.parse_args() + + print("=" * 80) + print(f"Step 4: Grep Performance Benchmark — engine={args.engine_label}") + print("=" * 80) + print() + print("Ensure ov.conf has the desired grep config and the server is restarted.") + print() + + # Run benchmark + results = benchmark_engine(args.engine_label) + + # Save results + output_file = f"step4_result_{args.engine_label}.json" + with open(output_file, "w") as f: + json.dump({"engine_label": args.engine_label, "results": results}, f, indent=2) + print(f"\nResults saved to {output_file}") + + # Print current results table + print() + print(f"{'Label':<50} {'Avg(ms)':>10} {'Min(ms)':>10} {'Max(ms)':>10} {'Matches':>10}") + print("-" * 95) + for r in results: + if "error" in r: + print(f"{r['label']:<50} {'FAILED':>10}") + else: + print( + f"{r['label']:<50} {r['avg_ms']:>10.1f} {r['min_ms']:>10.1f} " + f"{r['max_ms']:>10.1f} {r['matches']:>10}" + ) + print() + + # Compare with previous results + if args.compare: + if not os.path.isfile(args.compare): + print(f"Warning: compare file not found: {args.compare}") + else: + with open(args.compare) as f: + prev = json.load(f) + prev_label = prev.get("engine_label", "previous") + prev_results = prev.get("results", []) + print_comparison(args.engine_label, results, prev_label, prev_results) if __name__ == "__main__": diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py b/benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py index fbd4e3bef2..06d4f8330e 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py @@ -19,10 +19,9 @@ is not truncated. No need to test different limit values. 
Usage: - python3 step5_retrieval_quality.py [--uri URI] [--case-insensitive] [--output FILE] + python3 step5_retrieval_quality.py [--uri URI] """ -import argparse import json import os import re @@ -54,7 +53,7 @@ ] -def run_ov_grep(uri: str, pattern: str, case_insensitive: bool = False) -> tuple[set[str], float]: +def run_ov_grep(uri: str, pattern: str) -> tuple[set[str], float]: """Run `ov grep --output json` and extract matched URIs.""" cmd = OV_CMD + [ "--output", @@ -66,8 +65,6 @@ def run_ov_grep(uri: str, pattern: str, case_insensitive: bool = False) -> tuple "100000", pattern, ] - if case_insensitive: - cmd.insert(cmd.index("grep") + 1, "-i") cmd_str = shlex.join(cmd) print(f" $ {cmd_str}") @@ -80,9 +77,19 @@ def run_ov_grep(uri: str, pattern: str, case_insensitive: bool = False) -> tuple stderr = result.stderr.strip()[:200] raise RuntimeError(f"ov grep failed (exit={result.returncode}): {stderr}") + # Find the JSON line in stdout (skip echo_command output like "cmd: ov grep ...") + json_line = None + for line in result.stdout.strip().splitlines(): + line = line.strip() + if line.startswith("{"): + json_line = line + break + if not json_line: + raise RuntimeError(f"No JSON output from ov grep. 
stdout: {result.stdout[:200]}") + # Parse JSON response: {"status": "ok", "result": {"matches": [...], ...}} try: - resp = json.loads(result.stdout) + resp = json.loads(json_line) except json.JSONDecodeError as e: raise RuntimeError(f"Failed to parse ov grep output: {e}") from e @@ -101,10 +108,9 @@ def local_path_to_viking_uri(filepath: str) -> str: return "viking://resources/" + rel.replace(os.sep, "/").rstrip("/") -def compute_ground_truth(pattern: str, case_insensitive: bool = False) -> tuple[set[str], float]: +def compute_ground_truth(pattern: str) -> tuple[set[str], float]: """Scan local benchmark files with Python regex to get ground truth.""" - flags = re.IGNORECASE if case_insensitive else 0 - compiled = re.compile(pattern, flags) + compiled = re.compile(pattern) truth_uris = set() t0 = time.monotonic() for root, dirs, files in os.walk(BENCHMARK_DIR): @@ -141,13 +147,7 @@ def compute_metrics(truth: set[str], predicted: set[str]) -> dict: def main(): - parser = argparse.ArgumentParser(description="Step 5: Retrieval quality evaluation") - parser.add_argument("--uri", default=BASE_URI, help=f"Base URI to search (default: {BASE_URI})") - parser.add_argument("--case-insensitive", action="store_true", help="Case-insensitive matching") - parser.add_argument( - "--output", default=None, help="Output JSON file path (default: print to stdout only)" - ) - args = parser.parse_args() + uri = BASE_URI if not os.path.isdir(BENCHMARK_DIR): print(f"Error: Benchmark data not found at {BENCHMARK_DIR}") @@ -157,9 +157,8 @@ def main(): print("=" * 100) print("Retrieval Quality Evaluation: auto (bm25+fs) vs local fs (ground truth)") print("=" * 100) - print(f"URI: {args.uri}") - print(f"Case insensitive: {args.case_insensitive}") - print(f"Data dir: {BENCHMARK_DIR}") + print(f"URI: {uri}") + print(f"Data dir: {BENCHMARK_DIR}") print() print("Ensure ov.conf has:") print(' "grep": {"engine": "auto", "switch_to_remote_threshold": 0}') @@ -172,12 +171,12 @@ def main(): print(f"--- 
{label} (pattern: {pattern}) ---") # Ground truth: scan local files - truth_uris, fs_elapsed = compute_ground_truth(pattern, args.case_insensitive) + truth_uris, fs_elapsed = compute_ground_truth(pattern) print(f" Ground truth (local fs): {len(truth_uris)} matches ({fs_elapsed:.2f}s)") # Auto grep (via ov CLI with --output json) try: - auto_uris, auto_elapsed = run_ov_grep(args.uri, pattern, args.case_insensitive) + auto_uris, auto_elapsed = run_ov_grep(uri, pattern) except Exception as e: print(f" Auto grep FAILED: {e}") results.append( @@ -262,12 +261,6 @@ def main(): ) print() - # Save results - if args.output: - with open(args.output, "w") as f: - json.dump(results, f, indent=2) - print(f"Results saved to {args.output}") - if __name__ == "__main__": main() From f9b4065da04372c9789aaf1635093440dd854a89 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Wed, 27 May 2026 19:18:51 +0800 Subject: [PATCH 12/31] fix(grep): store full content for BM25; use PathScope depth; reduce redundant API calls --- openviking/core/context.py | 4 +- openviking/server/routers/search.py | 2 +- openviking/service/fs_service.py | 2 +- openviking/service/reindex_executor.py | 6 ++- .../queuefs/embedding_msg_converter.py | 6 ++- openviking/storage/vectordb_adapters/base.py | 13 ++++- openviking/storage/viking_fs.py | 54 ++++++++++++------- openviking/utils/embedding_utils.py | 27 +++++++--- tests/storage/test_viking_fs_grep.py | 8 +-- 9 files changed, 84 insertions(+), 38 deletions(-) diff --git a/openviking/core/context.py b/openviking/core/context.py index e43d3c3fa5..33691191b4 100644 --- a/openviking/core/context.py +++ b/openviking/core/context.py @@ -41,12 +41,14 @@ class ContextLevel(int, Enum): class Vectorize: text: str = "" + full_text: str = "" # Full content for BM25 (not embedding-truncated) # image: str = "" # video: str = "" # audio: str = "" - def __init__(self, text: str = ""): + def __init__(self, text: str = "", full_text: str = ""): self.text = text + self.full_text = 
full_text class Context: diff --git a/openviking/server/routers/search.py b/openviking/server/routers/search.py index c61d81e853..9e804a0bd9 100644 --- a/openviking/server/routers/search.py +++ b/openviking/server/routers/search.py @@ -110,7 +110,7 @@ class GrepRequest(BaseModel): pattern: str case_insensitive: bool = False node_limit: Optional[int] = None - level_limit: int = 5 + level_limit: int = 10 remote_return_limit: int = Field( default=0, ge=0, diff --git a/openviking/service/fs_service.py b/openviking/service/fs_service.py index b42de2b3b1..eca8cac45c 100644 --- a/openviking/service/fs_service.py +++ b/openviking/service/fs_service.py @@ -242,7 +242,7 @@ async def grep( exclude_uri: Optional[str] = None, case_insensitive: bool = False, node_limit: Optional[int] = None, - level_limit: int = 5, + level_limit: int = 10, remote_return_limit: int = 0, ) -> Dict: """Content search.""" diff --git a/openviking/service/reindex_executor.py b/openviking/service/reindex_executor.py index 1ec2099685..9718a7ff18 100644 --- a/openviking/service/reindex_executor.py +++ b/openviking/service/reindex_executor.py @@ -664,12 +664,15 @@ async def _reindex_resource_vectors_from_entries( counters.warnings.append(f"No vector source found for {file_uri}") continue abstract = self._prefer_non_empty(summary, vector_text) + # Read full file content for BM25 content field (not embedding-truncated) + full_text = await self._safe_read_text(file_uri, ctx=ctx) or vector_text try: await self._upsert_context( uri=file_uri, parent_uri=parent_uri, abstract=abstract, vector_text=vector_text, + full_text=full_text, is_leaf=True, context_type=context_type_for_uri(file_uri), level=ContextLevel.DETAIL, @@ -1302,6 +1305,7 @@ async def _upsert_context( parent_uri: str, abstract: str, vector_text: str, + full_text: str = "", is_leaf: bool, context_type: str, level: ContextLevel, @@ -1323,7 +1327,7 @@ async def _upsert_context( owner_space=owner_space_for_uri(uri, ctx), meta=meta or {}, ) - 
context.set_vectorize(Vectorize(text=vector_text)) + context.set_vectorize(Vectorize(text=vector_text, full_text=full_text or vector_text)) msg = EmbeddingMsgConverter.from_context(context) if msg is None: raise OpenVikingError( diff --git a/openviking/storage/queuefs/embedding_msg_converter.py b/openviking/storage/queuefs/embedding_msg_converter.py index 6f983546f9..6cda82b4e4 100644 --- a/openviking/storage/queuefs/embedding_msg_converter.py +++ b/openviking/storage/queuefs/embedding_msg_converter.py @@ -68,9 +68,11 @@ def from_context(context: Context) -> EmbeddingMsg: resolved_level = int(resolved_level.value) context_data["level"] = int(resolved_level) - # Store vectorization text in content field for bm25 full-text search. + # Store full content in content field for bm25 full-text search. + # Use full_text (raw file content) when available; fall back to vectorization_text. # Truncate to 64KB (VikingDB text field limit). - context_data["content"] = vectorization_text[:65536] + full_content = context.vectorize.full_text or vectorization_text + context_data["content"] = full_content[:65536] embedding_msg = EmbeddingMsg( message=vectorization_text, diff --git a/openviking/storage/vectordb_adapters/base.py b/openviking/storage/vectordb_adapters/base.py index f82275029a..3733a0ebad 100644 --- a/openviking/storage/vectordb_adapters/base.py +++ b/openviking/storage/vectordb_adapters/base.py @@ -4,6 +4,7 @@ from __future__ import annotations +import json import math import uuid from abc import ABC, abstractmethod @@ -533,13 +534,23 @@ def search_by_keywords( output_fields: Optional[list[str]] = None, ) -> list[Dict[str, Any]]: coll = self.get_collection() + compiled_filter = self._compile_filter(filter) + logger.debug( + "search_by_keywords: keywords=%s query=%s limit=%s offset=%s filter=%s output_fields=%s", + keywords, + query, + limit, + offset, + json.dumps(compiled_filter, ensure_ascii=False), + output_fields, + ) result = coll.search_by_keywords( 
index_name=self._index_name, keywords=keywords, query=query, limit=limit, offset=offset, - filters=self._compile_filter(filter), + filters=compiled_filter, output_fields=output_fields, ) records: list[Dict[str, Any]] = [] diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 9952d90111..759916ae08 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -42,7 +42,7 @@ from openviking.resource.watch_storage import is_watch_task_control_uri from openviking.server.error_mapping import is_not_found_error, map_exception from openviking.server.identity import RequestContext, Role -from openviking.storage.expr import And, Eq, PathScope +from openviking.storage.expr import PathScope from openviking.telemetry import get_current_telemetry from openviking.utils.time_utils import format_simplified, get_current_timestamp, parse_iso_datetime from openviking_cli.exceptions import ( @@ -235,6 +235,7 @@ def __init__( self._encryptor = encryptor self._count_cache: Dict[str, tuple] = {} # cache_key → (count, timestamp) self._count_cache_max_size = 1024 + self._fulltext_available: Optional[bool] = None # cached result of _collection_has_fulltext self._bound_ctx: contextvars.ContextVar[Optional[RequestContext]] = contextvars.ContextVar( "vikingfs_bound_ctx", default=None ) @@ -692,7 +693,7 @@ async def grep( exclude_uri: Optional[str] = None, case_insensitive: bool = False, node_limit: Optional[int] = None, - level_limit: int = 5, + level_limit: int = 10, ctx: Optional[RequestContext] = None, remote_return_limit: int = 0, ) -> Dict: @@ -719,7 +720,9 @@ async def grep( Dict with matches, count, match_count, files_scanned """ self._ensure_access(uri, ctx) - await self.stat(uri, ctx=ctx) + # Skip vector_store.count() — the count field is not needed for grep, + # and avoiding it saves one VikingDB API call. 
+ await self.stat(uri, ctx=ctx, skip_count=True) # Clamp remote_return_limit to valid range (0 = auto, 1-100000 = explicit) if remote_return_limit < 0: @@ -797,12 +800,19 @@ async def _resolve_grep_engine( return "vikingdb_then_fs" async def _collection_has_fulltext(self, vector_store, ctx) -> bool: - """Check if collection has content field and FullText config.""" + """Check if collection has content field and FullText config. + + Result is cached on the VikingFS instance since collection schema + does not change at runtime. + """ + if self._fulltext_available is not None: + return self._fulltext_available try: meta = None if hasattr(vector_store, "get_collection_meta"): meta = await vector_store.get_collection_meta(ctx=ctx) if not meta: + self._fulltext_available = False return False fields = meta.get("Fields", []) has_content = any( @@ -810,7 +820,9 @@ async def _collection_has_fulltext(self, vector_store, ctx) -> bool: ) fulltext = meta.get("FullText") or [] has_content_fulltext = any(ft.get("Field") == "content" for ft in fulltext) - return has_content and has_content_fulltext + result = has_content and has_content_fulltext + self._fulltext_available = result + return result except Exception: logger.debug( "Failed to check collection fulltext config, assuming no fulltext", exc_info=True @@ -818,8 +830,8 @@ async def _collection_has_fulltext(self, vector_store, ctx) -> bool: return False async def _get_cached_count(self, uri: str, ctx) -> int: - """Get cached count of L2 records for a URI (TTL=60s).""" - _COUNT_CACHE_TTL = 60 + """Get cached count of records for a URI (TTL=1h).""" + _COUNT_CACHE_TTL = 3600 vector_store = self._get_vector_store() # Include account_id in cache key for multi-tenant safety @@ -831,9 +843,7 @@ async def _get_cached_count(self, uri: str, ctx) -> int: if cached and (now - cached[1]) < _COUNT_CACHE_TTL: return cached[0] - count = await vector_store.count( - filter=And([PathScope("uri", uri), Eq("level", 2)]), ctx=ctx - ) + count = 
await vector_store.count(filter=PathScope("uri", uri, depth=-1), ctx=ctx) # Evict oldest entries if cache exceeds max size if len(self._count_cache) >= self._count_cache_max_size: oldest_keys = sorted(self._count_cache, key=lambda k: self._count_cache[k][1]) @@ -896,12 +906,7 @@ async def _grep_vikingdb_then_fs( # Split regex alternation (e.g. "error|warning|fail") into individual keywords # for bm25 search. Limit to 10 keywords per VikingDB API constraint. keywords = [kw.strip() for kw in pattern.split("|") if kw.strip()][:10] - filter_expr = And( - [ - PathScope("uri", uri), - Eq("level", 2), - ] - ) + filter_expr = PathScope("uri", uri, depth=level_limit) # Auto-adapt remote_return_limit: when 0 (default), use the maximum # limit (100000) so that search_by_keywords returns all matching @@ -995,7 +1000,7 @@ async def _grep_with_agfs( exclude_uri: Optional[str] = None, case_insensitive: bool = False, node_limit: Optional[int] = None, - level_limit: int = 5, + level_limit: int = 10, ctx: Optional[RequestContext] = None, ) -> Dict: """Grep using agfs native implementation. @@ -1091,7 +1096,7 @@ async def _grep_encrypted( exclude_uri: Optional[str] = None, case_insensitive: bool = False, node_limit: Optional[int] = None, - level_limit: int = 5, + level_limit: int = 10, ctx: Optional[RequestContext] = None, ) -> Dict: """Grep implementation for encrypted files. @@ -1256,7 +1261,9 @@ def _calculate_grep_match_depth(self, match_file: str) -> int: return 0 return len([part for part in match_file.split("/") if part]) - async def stat(self, uri: str, ctx: Optional[RequestContext] = None) -> Dict[str, Any]: + async def stat( + self, uri: str, ctx: Optional[RequestContext] = None, skip_count: bool = False + ) -> Dict[str, Any]: """ File/directory information. 
@@ -1270,6 +1277,13 @@ async def stat(self, uri: str, ctx: Optional[RequestContext] = None) -> Dict[str count (int): For directories, the number of nodes in the vector index under this directory (including subdirectories). For files, this field is not included. + + Args: + uri: Viking URI + ctx: Request context + skip_count: If True, skip the vector_store.count() call for directories. + Use this when the count field is not needed (e.g. in grep) to avoid + an extra VikingDB API call. """ self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) @@ -1277,7 +1291,7 @@ async def stat(self, uri: str, ctx: Optional[RequestContext] = None) -> Dict[str if isinstance(result, dict): result["isLocked"] = await self._is_path_locked_async(path) # Add count for directories if vector store available - if result.get("isDir", False): + if not skip_count and result.get("isDir", False): try: vector_store = self._get_vector_store() if vector_store: diff --git a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py index 44c808cbb2..474ae3b707 100644 --- a/openviking/utils/embedding_utils.py +++ b/openviking/utils/embedding_utils.py @@ -16,6 +16,7 @@ from openviking.storage.queuefs import get_queue_manager from openviking.storage.queuefs.embedding_msg_converter import EmbeddingMsgConverter from openviking.storage.viking_fs import get_viking_fs +from openviking.utils.embedding_input import truncate_embedding_input from openviking.utils.time_utils import parse_iso_datetime from openviking_cli.utils import VikingURI, get_logger from openviking_cli.utils.config import get_openviking_config @@ -238,7 +239,7 @@ async def vectorize_directory_meta( account_id=ctx.account_id, owner_space=owner_space, ) - context_abstract.set_vectorize(Vectorize(text=abstract)) + context_abstract.set_vectorize(Vectorize(text=abstract, full_text=abstract)) msg_abstract = EmbeddingMsgConverter.from_context(context_abstract) _apply_scalar_overrides( msg_abstract, @@ -271,7 +272,7 @@ 
async def vectorize_directory_meta( account_id=ctx.account_id, owner_space=owner_space, ) - context_overview.set_vectorize(Vectorize(text=overview)) + context_overview.set_vectorize(Vectorize(text=overview, full_text=overview)) msg_overview = EmbeddingMsgConverter.from_context(context_overview) _apply_scalar_overrides( msg_overview, @@ -353,6 +354,7 @@ async def vectorize_file( embedding_cfg = get_openviking_config().embedding configured_text_source = getattr(embedding_cfg, "text_source", "content_only") effective_text_source = "summary_only" if use_summary else configured_text_source + max_input_tokens = int(getattr(embedding_cfg, "max_input_tokens", 4096) or 4096) if content_type is None: # Unsupported file type: fall back to summary if available @@ -360,7 +362,7 @@ async def vectorize_file( logger.warning( f"Unsupported file type for {file_path}, falling back to summary for vectorization" ) - context.set_vectorize(Vectorize(text=summary)) + context.set_vectorize(Vectorize(text=summary, full_text=summary)) else: logger.warning( f"Unsupported file type for {file_path} and no summary available, skipping vectorization" @@ -368,20 +370,31 @@ async def vectorize_file( return elif content_type == ResourceContentType.TEXT: if summary and effective_text_source in {"summary_first", "summary_only"}: - context.set_vectorize(Vectorize(text=summary)) + # Use summary for vectorization, but store full file content for BM25. + full_content = "" + try: + raw = await viking_fs.read_file(file_path, ctx=ctx) + if isinstance(raw, bytes): + raw = raw.decode("utf-8", errors="replace") + full_content = raw + except Exception: + pass + context.set_vectorize(Vectorize(text=summary, full_text=full_content or summary)) else: # Read raw file content; embedders apply their own input guard. 
try: content = await viking_fs.read_file(file_path, ctx=ctx) if isinstance(content, bytes): content = content.decode("utf-8", errors="replace") - context.set_vectorize(Vectorize(text=content)) + full_content = content + content = truncate_embedding_input(content, max_input_tokens) + context.set_vectorize(Vectorize(text=content, full_text=full_content)) except Exception as e: logger.warning( f"Failed to read file content for {file_path}, falling back to summary: {e}" ) if summary: - context.set_vectorize(Vectorize(text=summary)) + context.set_vectorize(Vectorize(text=summary, full_text=summary)) else: logger.warning( f"No summary available for {file_path}, skipping vectorization" @@ -389,7 +402,7 @@ async def vectorize_file( return elif summary: # For non-text files, use summary - context.set_vectorize(Vectorize(text=summary)) + context.set_vectorize(Vectorize(text=summary, full_text=summary)) else: logger.debug(f"Skipping file {file_path} (no text content or summary)") return diff --git a/tests/storage/test_viking_fs_grep.py b/tests/storage/test_viking_fs_grep.py index 8fea8283b9..0f55baaf19 100644 --- a/tests/storage/test_viking_fs_grep.py +++ b/tests/storage/test_viking_fs_grep.py @@ -17,7 +17,7 @@ class _DummyAgfs: async def test_grep_preserves_dfs_order_and_node_limit(monkeypatch): fs = VikingFS(agfs=_DummyAgfs()) - async def fake_stat(uri, ctx=None): + async def fake_stat(uri, ctx=None, skip_count=False): return {"isDir": True} async def fake_ls(uri, ctx=None, **kwargs): @@ -80,7 +80,7 @@ def fake_agfs_read(path, offset=0, size=-1): async def test_grep_parallel_reads_respect_concurrency_limit(monkeypatch): fs = VikingFS(agfs=_DummyAgfs()) - async def fake_stat(uri, ctx=None): + async def fake_stat(uri, ctx=None, skip_count=False): return {"isDir": True} async def fake_ls(uri, ctx=None, **kwargs): @@ -121,7 +121,7 @@ def fake_agfs_read(path, offset=0, size=-1): async def test_grep_parallel_reads_work_with_blocking_agfs_read(monkeypatch): fs = 
VikingFS(agfs=_DummyAgfs()) - async def fake_stat(uri, ctx=None): + async def fake_stat(uri, ctx=None, skip_count=False): return {"isDir": True} async def fake_ls(uri, ctx=None, **kwargs): @@ -155,7 +155,7 @@ def fake_agfs_read(path, offset=0, size=-1): async def test_grep_stops_scheduling_later_batches_after_node_limit(monkeypatch): fs = VikingFS(agfs=_DummyAgfs()) - async def fake_stat(uri, ctx=None): + async def fake_stat(uri, ctx=None, skip_count=False): return {"isDir": True} async def fake_ls(uri, ctx=None, **kwargs): From 240fd27457ff7bac9b047cd808da7db6281bae50 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Thu, 28 May 2026 20:38:30 +0800 Subject: [PATCH 13/31] refactor: new benchmark --- .../grep/vikingdb_bm25/step1_add_resource.py | 112 ++++++++ .../grep/vikingdb_bm25/step1_generate.py | 166 ----------- ...{step4_benchmark.py => step2_benchmark.py} | 134 +++++---- .../vikingdb_bm25/step2_quick_add_resource.py | 123 -------- .../grep/vikingdb_bm25/step3_build_index.py | 181 ------------ .../vikingdb_bm25/step3_retrieval_quality.py | 258 +++++++++++++++++ .../vikingdb_bm25/step5_retrieval_quality.py | 266 ------------------ 7 files changed, 434 insertions(+), 806 deletions(-) create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py delete mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py rename benchmark/retrieval/grep/vikingdb_bm25/{step4_benchmark.py => step2_benchmark.py} (59%) delete mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py delete mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step3_retrieval_quality.py delete mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py b/benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py new file mode 100644 index 0000000000..81f18d1ab3 --- /dev/null +++ 
b/benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +"""Step 1: Import GitHub repositories as benchmark data via OpenViking SDK. + +Imports one or more GitHub repositories into OpenViking using the Python SDK +(SyncOpenViking) to avoid HTTP timeout issues with the CLI. Resources are +added with wait=True so VLM summarization and embedding are completed +before the script returns. + +Usage: + python3 step1_add_resource.py + python3 step1_add_resource.py --repos "https://github.com/volcengine/OpenViking" + python3 step1_add_resource.py --repos "https://github.com/volcengine/OpenViking" "https://github.com/another/repo" +""" + +from __future__ import annotations + +import argparse +import time + +from openviking.sync_client import SyncOpenViking + +DEFAULT_REPOS = [ + "https://github.com/volcengine/OpenViking", +] + +BENCHMARK_PARENT = "viking://resources/benchmark" + + +def main(): + parser = argparse.ArgumentParser( + description="Step 1: Import GitHub repos as benchmark data via SDK" + ) + parser.add_argument( + "--repos", + nargs="+", + default=DEFAULT_REPOS, + help="GitHub repo URLs to import (default: OpenViking repo)", + ) + parser.add_argument( + "--parent", + default=BENCHMARK_PARENT, + help=f"Parent Viking URI (default: {BENCHMARK_PARENT})", + ) + args = parser.parse_args() + + print("=" * 80) + print("Step 1: Import GitHub Repositories as Benchmark Data") + print("=" * 80) + print(f" Repos: {args.repos}") + print(f" Parent: {args.parent}") + print() + + client = SyncOpenViking() + client.initialize() + + results = [] + for repo_url in args.repos: + repo_name = repo_url.rstrip("/").split("/")[-1] + print(f"--- Importing {repo_name} ---") + + t0 = time.monotonic() + try: + result = client.add_resource( + path=repo_url, + parent=args.parent, + reason=f"benchmark data: {repo_name}", + wait=True, + create_parent=True, + ) + elapsed = time.monotonic() - t0 + root_uri = result.get("root_uri", "?") + print(f" OK 
({elapsed:.1f}s) -> {root_uri}") + results.append( + { + "repo": repo_url, + "status": "ok", + "elapsed_s": round(elapsed, 1), + "root_uri": root_uri, + } + ) + except Exception as e: + elapsed = time.monotonic() - t0 + print(f" FAILED ({elapsed:.1f}s): {e}") + results.append( + { + "repo": repo_url, + "status": "failed", + "elapsed_s": round(elapsed, 1), + "error": str(e)[:500], + } + ) + + client.close() + + print() + print("Summary:") + for r in results: + status = r["status"] + repo = r["repo"] + elapsed = r["elapsed_s"] + print(f" {status.upper():>7s} {repo} ({elapsed}s)") + + ok_count = sum(1 for r in results if r["status"] == "ok") + if ok_count == len(results): + print(f"\nAll {ok_count} repos imported and processed successfully.") + print("Next step: run step2_benchmark.py to measure grep performance") + else: + print(f"\n{ok_count}/{len(results)} repos imported successfully. Check errors above.") + + +if __name__ == "__main__": + main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py b/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py deleted file mode 100644 index d69717add6..0000000000 --- a/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -"""Generate benchmark data for grep bm25 vs fs comparison. - -Produces ~80,000 markdown files (~50KB each, ~4GB total) in a 4-level -directory tree: - level0: 10 dirs - level1: 10 dirs per level0 (100 total) - level2: 10 dirs per level1 (1,000 total) - level3: 8 dirs per level2 (8,000 total) - files: 10 per level3 dir (80,000 total) - -Target keywords appear in ~1% of files each, simulating a realistic -large-scale codebase where bm25 recall dramatically reduces search scope. 
-""" - -import os -import random - -BASE_DIR = os.path.expanduser("~/.openviking/data/benchmark") - -# Directory tree — each level has independent dir count -LEVEL0_DIRS = 10 -LEVEL1_DIRS = 10 # per level0 dir -LEVEL2_DIRS = 10 # per level1 dir -LEVEL3_DIRS = 8 # per level2 dir (flexible) -FILES_PER_DIR = 10 # per level3 dir - -# Total: 10 * 10 * 10 * 8 * 10 = 80,000 files -# Size: 80,000 * 50KB ≈ 4GB -# Each top-level dir: 10*10*8*10*50KB = 400MB - -TARGET_FILE_SIZE = 50000 # ~50KB - -TARGET_KEYWORDS = ["VikingDB", "FullText", "bm25", "search_by_keywords"] - -FILLER_WORDS = [ - "configuration", - "deployment", - "architecture", - "implementation", - "performance", - "optimization", - "integration", - "middleware", - "authentication", - "authorization", - "encryption", - "validation", - "monitoring", - "logging", - "caching", - "serialization", - "concurrency", - "scalability", - "reliability", - "observability", - "throughput", - "latency", - "availability", - "consistency", - "partitioning", - "replication", - "failover", - "loadbalancing", - "containerization", - "orchestration", - "provisioning", - "lifecycle", -] - -random.seed(42) - -total_files = LEVEL0_DIRS * LEVEL1_DIRS * LEVEL2_DIRS * LEVEL3_DIRS * FILES_PER_DIR -keyword_hit_count = max(1, total_files // 100) # 1% = 800 files per keyword -file_indices = list(range(total_files)) -keyword_files = {} -for kw in TARGET_KEYWORDS: - chosen = random.sample(file_indices, keyword_hit_count) - keyword_files[kw] = set(chosen) - - -def generate_section(title_level): - """Generate a markdown section with realistic filler content.""" - prefix = "#" * title_level - title_words = random.sample(FILLER_WORDS, 3) - title = f"{prefix} {' '.join(title_words).title()}\n\n" - - paragraphs = [] - for _ in range(random.randint(2, 5)): - sentences = [] - for _ in range(random.randint(3, 8)): - words = random.choices(FILLER_WORDS, k=random.randint(8, 15)) - sentences.append(" ".join(words).capitalize() + ".") - paragraphs.append(" 
".join(sentences)) - - return title + "\n\n".join(paragraphs) + "\n\n" - - -def generate_file(file_idx): - """Generate a ~50KB markdown file with 3-5 h1 sections, each with 5-10 h2 sections.""" - parts = [] - num_h1 = random.randint(3, 5) - for _ in range(num_h1): - parts.append(generate_section(1)) - num_h2 = random.randint(5, 10) - for _ in range(num_h2): - parts.append(generate_section(2)) - - # Inject target keyword if this file is selected - for kw, indices in keyword_files.items(): - if file_idx in indices: - injection = ( - f"\nThis module provides {kw} integration for advanced search capabilities. " - f"The {kw} feature enables efficient keyword-based retrieval across large datasets.\n\n" - ) - parts[2] = parts[2] + injection # after first h1 + first h2 - - content = "".join(parts) - # Pad to target size if needed - if len(content) < TARGET_FILE_SIZE: - padding_parts = [] - while len("".join(padding_parts)) < TARGET_FILE_SIZE - len(content): - words = random.choices(FILLER_WORDS, k=20) - padding_parts.append(" ".join(words).capitalize() + ".\n") - content += "\n\n## Appendix\n\n" + "".join(padding_parts) - - return content[:TARGET_FILE_SIZE] - - -print(f"Generating {total_files} markdown files under {BASE_DIR}...") -print( - f" Tree: level0={LEVEL0_DIRS} x level1={LEVEL1_DIRS} x level2={LEVEL2_DIRS} x level3={LEVEL3_DIRS}" -) -print(f" Files per leaf dir: {FILES_PER_DIR}") -print(f" Target keywords: {TARGET_KEYWORDS}") -print( - f" Each keyword appears in ~{keyword_hit_count} files out of {total_files} " - f"(~{keyword_hit_count / total_files * 100:.1f}%)" -) -print(f" Estimated total size: ~{total_files * TARGET_FILE_SIZE / 1e9:.1f} GB") - -file_idx = 0 -os.makedirs(BASE_DIR, exist_ok=True) - -for i0 in range(LEVEL0_DIRS): - d0 = os.path.join(BASE_DIR, f"level0_{i0:02d}") - os.makedirs(d0, exist_ok=True) - for i1 in range(LEVEL1_DIRS): - d1 = os.path.join(d0, f"level1_{i1:02d}") - os.makedirs(d1, exist_ok=True) - for i2 in range(LEVEL2_DIRS): - d2 = 
os.path.join(d1, f"level2_{i2:02d}") - os.makedirs(d2, exist_ok=True) - for i3 in range(LEVEL3_DIRS): - d3 = os.path.join(d2, f"level3_{i3:02d}") - os.makedirs(d3, exist_ok=True) - for f in range(FILES_PER_DIR): - filepath = os.path.join(d3, f"doc_{f:04d}.md") - content = generate_file(file_idx) - with open(filepath, "w") as fh: - fh.write(content) - file_idx += 1 - if file_idx % 10000 == 0: - print(f" ... {file_idx} files written") - -print(f"Done! {file_idx} files generated under {BASE_DIR}") diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/step2_benchmark.py similarity index 59% rename from benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py rename to benchmark/retrieval/grep/vikingdb_bm25/step2_benchmark.py index b45f068a5e..8d7519d2a1 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step4_benchmark.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/step2_benchmark.py @@ -1,116 +1,106 @@ #!/usr/bin/env python3 -"""Step 4: Benchmark grep performance for the current engine config. +"""Step 2: Benchmark grep performance for the current engine config. Prerequisites: - 1. Run step1_generate.py to create test data - 2. Run step2_quick_add_resource.py to upload files (skip VLM+embedding) - 3. Run step3_build_index.py to build index (embedding + content) - 4. Set ov.conf grep config and restart the server + 1. Run step1_add_resource.py to import repos (includes VLM+embedding) + 2. Set ov.conf grep config and restart the server NOTE: `engine` and `switch_to_remote_threshold` are server-side config (ov.conf `grep` section). To benchmark different engines, update ov.conf and restart the server before each run. +KEYWORDS: Fill the KEYWORDS list below with real terms from the imported +repos. Each keyword will be tested individually, plus multi-keyword regex +and no-match scenarios. + Usage: # Run 1: benchmark with fs engine # 1. Set ov.conf: "grep": {"engine": "fs"} # 2. 
Restart server - python3 step4_benchmark.py --engine-label fs + python3 step2_benchmark.py --engine-label fs # Run 2: benchmark with auto engine (bm25) # 1. Set ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0} # 2. Restart server - python3 step4_benchmark.py --engine-label auto --compare step4_result_fs.json + python3 step2_benchmark.py --engine-label auto --compare step2_result_fs.json -Results are saved to step4_result_{engine_label}.json. +Results are saved to step2_result_{engine_label}.json. When --compare is given, a side-by-side comparison table is printed. """ +from __future__ import annotations + import argparse import json import os -import subprocess import time +from openviking.sync_client import SyncOpenViking + BASE_URI = "viking://resources/benchmark" -OV_CMD = ["ov", "--account", "default", "--user", "default"] RUNS = 3 WARMUP = 1 -# Test cases: (label, pattern, uri) -TEST_CASES = [ - # --- Single keyword --- - ("single keyword (VikingDB)", "VikingDB", BASE_URI), - ("single keyword (FullText)", "FullText", BASE_URI), - # --- Multi-keyword (regex alternation) --- - ("2 keywords (VikingDB|FullText)", "VikingDB|FullText", BASE_URI), - ("3 keywords (VikingDB|FullText|bm25)", "VikingDB|FullText|bm25", BASE_URI), - # --- Rare keyword (lower hit count) --- - ("rare keyword (search_by_keywords)", "search_by_keywords", BASE_URI), - # --- Non-existent keyword (0 matches) --- - ("no-match 1 keyword (zzz_nonexistent)", "zzz_nonexistent", BASE_URI), - ("no-match 2 keywords (zzz_a|zzz_b)", "zzz_a|zzz_b", BASE_URI), - ("no-match 3 keywords (zzz_a|zzz_b|zzz_c)", "zzz_a|zzz_b|zzz_c", BASE_URI), - # --- Subdirectory scope (~8K files per level0 dir) --- - ("subdir level0_00, VikingDB (~8K files)", "VikingDB", f"{BASE_URI}/level0_00"), - ("subdir level0_00, no-match (~8K files)", "zzz_nonexistent", f"{BASE_URI}/level0_00"), -] - - -def run_grep(pattern: str, uri: str) -> tuple[float, int]: - """Run a single grep command, return (elapsed_seconds, 
match_count).""" - cmd = OV_CMD + ["--output", "json", "grep", "--uri", uri, "-n", "100000", pattern] +KEYWORDS: list[str] = [] - start = time.monotonic() - result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) - elapsed = time.monotonic() - start - if result.returncode != 0: - raise RuntimeError( - f"ov grep failed (exit={result.returncode}): {result.stderr.strip()[:200]}" +def build_test_cases() -> list[tuple[str, str, str]]: + cases = [] + + for kw in KEYWORDS: + cases.append((f"keyword: {kw}", kw, BASE_URI)) + + if len(KEYWORDS) >= 2: + cases.append( + (f"multi 2: {KEYWORDS[0]}|{KEYWORDS[1]}", f"{KEYWORDS[0]}|{KEYWORDS[1]}", BASE_URI) + ) + if len(KEYWORDS) >= 3: + cases.append( + ( + f"multi 3: {KEYWORDS[0]}|{KEYWORDS[1]}|{KEYWORDS[2]}", + f"{KEYWORDS[0]}|{KEYWORDS[1]}|{KEYWORDS[2]}", + BASE_URI, + ) ) - # Find JSON line in stdout (skip echo_command output) - json_line = None - for line in result.stdout.strip().splitlines(): - line = line.strip() - if line.startswith("{"): - json_line = line - break + cases.append(("no-match: zzz_nonexistent_benchmark", "zzz_nonexistent_benchmark", BASE_URI)) + cases.append(("no-match 2: zzz_a|zzz_b", "zzz_a|zzz_b", BASE_URI)) + + return cases + + +def run_grep(client: SyncOpenViking, pattern: str, uri: str) -> tuple[float, int]: + start = time.monotonic() + result = client.grep(uri=uri, pattern=pattern, node_limit=100000) + elapsed = time.monotonic() - start match_count = 0 - if json_line: - try: - resp = json.loads(json_line) - grep_result = resp.get("result", resp) - match_count = len(grep_result.get("matches", [])) - except json.JSONDecodeError: - pass + if isinstance(result, dict): + matches = result.get("matches", []) + match_count = len(matches) return elapsed, match_count -def benchmark_engine(engine_label: str) -> list[dict]: - """Run all test cases for the current engine config.""" +def benchmark_engine(client: SyncOpenViking, engine_label: str) -> list[dict]: + test_cases = build_test_cases() 
results = [] - for label, pattern, uri in TEST_CASES: + for label, pattern, uri in test_cases: print(f" {label} ...", end=" ", flush=True) - # Warmup for _ in range(WARMUP): try: - run_grep(pattern, uri) + run_grep(client, pattern, uri) except Exception: pass - # Measured runs times = [] match_count = 0 failed = False for _ in range(RUNS): try: - elapsed, matches = run_grep(pattern, uri) + elapsed, matches = run_grep(client, pattern, uri) times.append(elapsed) match_count = matches except Exception as e: @@ -143,8 +133,6 @@ def benchmark_engine(engine_label: str) -> list[dict]: def print_comparison( current_label: str, current: list[dict], compare_label: str, compare: list[dict] ): - """Print side-by-side comparison table.""" - # Build lookup by label compare_by_label = {} for r in compare: if "error" not in r: @@ -190,27 +178,34 @@ def main(): parser.add_argument( "--compare", default=None, - help="Path to a previous step4_result_*.json file for side-by-side comparison", + help="Path to a previous step2_result_*.json file for side-by-side comparison", ) args = parser.parse_args() + if not KEYWORDS: + print("WARNING: KEYWORDS list is empty. 
Fill it with real terms before running.") + print(" Edit step2_benchmark.py and add keywords to the KEYWORDS list.\n") + print("=" * 80) - print(f"Step 4: Grep Performance Benchmark — engine={args.engine_label}") + print(f"Step 2: Grep Performance Benchmark — engine={args.engine_label}") print("=" * 80) print() print("Ensure ov.conf has the desired grep config and the server is restarted.") print() - # Run benchmark - results = benchmark_engine(args.engine_label) + client = SyncOpenViking() + client.initialize() + + try: + results = benchmark_engine(client, args.engine_label) + finally: + client.close() - # Save results - output_file = f"step4_result_{args.engine_label}.json" + output_file = f"step2_result_{args.engine_label}.json" with open(output_file, "w") as f: json.dump({"engine_label": args.engine_label, "results": results}, f, indent=2) print(f"\nResults saved to {output_file}") - # Print current results table print() print(f"{'Label':<50} {'Avg(ms)':>10} {'Min(ms)':>10} {'Max(ms)':>10} {'Matches':>10}") print("-" * 95) @@ -224,7 +219,6 @@ def main(): ) print() - # Compare with previous results if args.compare: if not os.path.isfile(args.compare): print(f"Warning: compare file not found: {args.compare}") diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py b/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py deleted file mode 100644 index 6fc1c2e89d..0000000000 --- a/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python3 -"""Step 2: Quick upload — import benchmark files skipping VLM+embedding. - -Walks the benchmark directory and uploads each file via the OpenViking Python SDK -with build_index=False, which skips VLM summarization and embedding. This makes -the upload phase fast and avoids circuit-breaker issues from VLM failures. 
- -After all files are uploaded, run step3_build_index.py to trigger VLM+embedding -in a controlled batch, then step4_benchmark.py to measure grep performance. - -Supports resume: a progress file (.add_resource_progress) tracks completed files. -If interrupted, re-run to automatically skip already-imported files. - -Usage: - python3 step2_quick_add_resource.py [--no-resume] [--max-failures N] -""" - -import argparse -import os -import sys - -BASE_DIR = os.path.expanduser("~/.openviking/data/benchmark") -DATA_DIR = os.path.expanduser("~/.openviking/data") -PROGRESS_FILE = os.path.join(BASE_DIR, ".add_resource_progress") - - -def load_progress() -> set: - """Load set of already-imported relative paths from progress file.""" - done = set() - if os.path.exists(PROGRESS_FILE): - with open(PROGRESS_FILE) as f: - for line in f: - line = line.strip() - if line: - done.add(line) - return done - - -def save_progress(rel_path: str) -> None: - """Append a completed relative path to the progress file and flush immediately.""" - with open(PROGRESS_FILE, "a") as f: - f.write(rel_path + "\n") - f.flush() - os.fsync(f.fileno()) - - -def main(): - parser = argparse.ArgumentParser( - description="Step 2: Quick upload benchmark files (skip VLM+embedding)" - ) - parser.add_argument( - "--no-resume", action="store_true", help="Disable auto-resume, start from scratch" - ) - parser.add_argument( - "--max-failures", type=int, default=10, help="Abort after N failures (default: 10)" - ) - args = parser.parse_args() - - from openviking.sync_client import SyncOpenViking - - client = SyncOpenViking() - client.initialize() - - # Collect all files first (deterministic order) - all_files = [] - for root, dirs, files in os.walk(BASE_DIR): - dirs.sort() - for fname in sorted(files): - if fname.endswith(".md"): - all_files.append(os.path.join(root, fname)) - - # Load resume state - done_set = set() - if not args.no_resume: - done_set = load_progress() - if done_set: - print(f"Resuming: 
{len(done_set)} files already imported (from {PROGRESS_FILE})") - - count = 0 - skipped = 0 - failed = 0 - - for filepath in all_files: - rel = os.path.relpath(filepath, DATA_DIR) - rel_dir = os.path.dirname(rel) - parent_uri = f"viking://resources/{rel_dir}" - - # Skip already-imported files - if rel in done_set: - skipped += 1 - continue - - idx = count + skipped + 1 - print(f"[{idx}/{len(all_files)}] Uploading {rel} ...", end=" ", flush=True) - - try: - client.add_resource( - path=filepath, - parent=parent_uri, - build_index=False, - wait=False, - create_parent=True, - ) - print("OK") - save_progress(rel) - except Exception as e: - print(f"FAILED: {e}") - failed += 1 - if failed >= args.max_failures: - print(f"\nToo many failures ({failed}), aborting. Re-run to resume.") - sys.exit(1) - - count += 1 - if count % 100 == 0: - print(f" ... {count} files uploaded this run ({failed} failed, {skipped} skipped)") - - print(f"\nDone! {count} uploaded, {skipped} skipped, {failed} failed") - if failed == 0: - print("Next step: run step3_build_index.py to trigger VLM+embedding") - - -if __name__ == "__main__": - main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py b/benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py deleted file mode 100644 index 22d229acd4..0000000000 --- a/benchmark/retrieval/grep/vikingdb_bm25/step3_build_index.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python3 -"""Step 3: Build index — trigger VLM+embedding on already-uploaded files via CLI. - -After step2_quick_add_resource.py uploads all files with build_index=False (skipping -VLM and embedding), this script calls `ov reindex` on each level3 directory to -trigger VLM summarization and embedding in-place, without re-uploading files. - -Uses level3 directory granularity for progress tracking (8000 dirs, ~10 files each), -which gives fine-grained resume capability. 
- -Usage: - python3 step3_build_index.py [--no-resume] [--mode MODE] [--max-failures N] -""" - -import argparse -import os -import shlex -import subprocess -import sys -import time - -BASE_DIR = os.path.expanduser("~/.openviking/data/benchmark") -PROGRESS_FILE = os.path.join(BASE_DIR, ".build_index_progress") -BENCHMARK_URI = "viking://resources/benchmark" - -# Tree structure from step1_generate.py -LEVEL0_DIRS = 10 -LEVEL1_DIRS = 10 -LEVEL2_DIRS = 10 -LEVEL3_DIRS = 8 - - -def discover_level3_dirs() -> list[str]: - """Discover all level3 directories under BASE_DIR (deterministic order).""" - dirs = [] - for i0 in range(LEVEL0_DIRS): - d0 = os.path.join(BASE_DIR, f"level0_{i0:02d}") - if not os.path.isdir(d0): - continue - for i1 in range(LEVEL1_DIRS): - d1 = os.path.join(d0, f"level1_{i1:02d}") - if not os.path.isdir(d1): - continue - for i2 in range(LEVEL2_DIRS): - d2 = os.path.join(d1, f"level2_{i2:02d}") - if not os.path.isdir(d2): - continue - for i3 in range(LEVEL3_DIRS): - d3 = os.path.join(d2, f"level3_{i3:02d}") - if os.path.isdir(d3): - dirs.append(os.path.relpath(d3, BASE_DIR)) - return dirs - - -def load_progress() -> set: - """Load set of already-indexed level3 relative paths from progress file.""" - done = set() - if os.path.exists(PROGRESS_FILE): - with open(PROGRESS_FILE) as f: - for line in f: - line = line.strip() - if line: - done.add(line) - return done - - -def save_progress(rel_path: str) -> None: - """Append a completed level3 relative path to the progress file.""" - with open(PROGRESS_FILE, "a") as f: - f.write(rel_path + "\n") - f.flush() - os.fsync(f.fileno()) - - -def run_cmd(cmd: list[str]) -> tuple[int, str, str, float]: - """Run command, return (returncode, stdout, stderr, elapsed_seconds).""" - t0 = time.time() - result = subprocess.run(cmd, capture_output=True, text=True, timeout=600) - elapsed = time.time() - t0 - return result.returncode, result.stdout, result.stderr, elapsed - - -def main(): - parser = argparse.ArgumentParser( - 
description="Step 3: Build index — trigger VLM+embedding via ov reindex" - ) - parser.add_argument( - "--no-resume", action="store_true", help="Disable auto-resume, start from scratch" - ) - parser.add_argument( - "--mode", - choices=["vectors_only", "semantic_and_vectors"], - default="vectors_only", - help="Reindex mode (default: vectors_only = embedding)", - ) - parser.add_argument( - "--max-failures", type=int, default=50, help="Abort after N failures (default: 50)" - ) - args = parser.parse_args() - - level3_dirs = discover_level3_dirs() - - if not level3_dirs: - print(f"No level3 directories found under {BASE_DIR}") - print("Did you run step1_generate.py and step2_quick_add_resource.py first?") - sys.exit(1) - - # Load resume state - done_set = set() - if not args.no_resume: - done_set = load_progress() - if done_set: - print(f"Resuming: {len(done_set)} dirs already indexed (from {PROGRESS_FILE})") - - count = 0 - skipped = 0 - failed = 0 - total = len(level3_dirs) - print(f"{total} level3 dirs to index, {len(done_set)} already done") - - for rel_dir in level3_dirs: - if rel_dir in done_set: - skipped += 1 - continue - - uri = f"{BENCHMARK_URI}/{rel_dir}" - cmd = [ - "ov", - "reindex", - "--account", - "default", - "--user", - "default", - "--mode", - args.mode, - "--wait", - "true", - uri, - ] - idx = count + skipped + 1 - cmd_str = shlex.join(cmd) - print(f"[{idx}/{total}] $ {cmd_str}") - - try: - rc, stdout, stderr, elapsed = run_cmd(cmd) - - if stdout.strip(): - for line in stdout.strip().splitlines(): - print(f" {line}") - if stderr.strip(): - for line in stderr.strip().splitlines(): - print(f" [stderr] {line}") - - if rc != 0: - print(f" FAILED (exit={rc}, {elapsed:.1f}s)") - failed += 1 - else: - print(f" OK ({elapsed:.1f}s)") - save_progress(rel_dir) - except subprocess.TimeoutExpired: - print(" TIMEOUT (600s)") - failed += 1 - except Exception as e: - print(f" ERROR: {e}") - failed += 1 - - if failed >= args.max_failures: - print(f"\nToo many failures 
({failed}), aborting. Re-run to resume.") - sys.exit(1) - - count += 1 - if count % 100 == 0: - print(f" ... {count} dirs indexed this run ({failed} failed, {skipped} skipped)") - - print(f"\nDone! {count} dirs indexed, {skipped} skipped, {failed} failed") - if failed == 0: - print("Next step: run step4_benchmark.py to measure grep performance") - - -if __name__ == "__main__": - main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step3_retrieval_quality.py b/benchmark/retrieval/grep/vikingdb_bm25/step3_retrieval_quality.py new file mode 100644 index 0000000000..a4aae34a8f --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/step3_retrieval_quality.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +"""Step 3: Retrieval quality evaluation — compare auto (bm25) vs fs grep. + +Prerequisites: + 1. Run step1_add_resource.py to import repos (includes VLM+embedding) + 2. Ensure ov.conf has: + "grep": {"engine": "auto", "switch_to_remote_threshold": 0} + (switch_to_remote_threshold = 0 forces VikingDB BM25 for all queries) + 3. Restart the server after changing ov.conf + +Approach: + - Ground truth: scan local repo files with Python regex (equivalent to fs engine) + - Test: call SDK grep to get structured results + - Compare: compute Recall, Precision, F1 per query pattern + +KEYWORDS: Fill the KEYWORDS list below with real terms from the imported +repos. Each keyword will be tested for retrieval quality. 
+ +Usage: + python3 step3_retrieval_quality.py +""" + +from __future__ import annotations + +import os +import re +import time + +from openviking.sync_client import SyncOpenViking + +BASE_URI = "viking://resources/benchmark" +DATA_DIR = os.path.expanduser("~/.openviking/data") + +KEYWORDS: list[str] = [] + + +def build_test_patterns() -> list[tuple[str, str]]: + patterns = [] + + for kw in KEYWORDS: + patterns.append((f"keyword: {kw}", kw)) + + if len(KEYWORDS) >= 2: + patterns.append((f"multi 2: {KEYWORDS[0]}|{KEYWORDS[1]}", f"{KEYWORDS[0]}|{KEYWORDS[1]}")) + + patterns.append(("no-match: zzz_nonexistent_benchmark", "zzz_nonexistent_benchmark")) + + return patterns + + +def run_sdk_grep(client: SyncOpenViking, uri: str, pattern: str) -> tuple[set[str], float]: + t0 = time.monotonic() + result = client.grep(uri=uri, pattern=pattern, node_limit=100000) + elapsed = time.monotonic() - t0 + + uris = set() + if isinstance(result, dict): + for match in result.get("matches", []): + uri_val = match.get("uri", "") + if uri_val: + uris.add(uri_val.rstrip("/")) + return uris, elapsed + + +def local_path_to_viking_uri(filepath: str) -> str: + rel = os.path.relpath(filepath, DATA_DIR) + return "viking://resources/" + rel.replace(os.sep, "/").rstrip("/") + + +def compute_ground_truth(pattern: str, search_dirs: list[str]) -> tuple[set[str], float]: + compiled = re.compile(pattern) + truth_uris = set() + t0 = time.monotonic() + for search_dir in search_dirs: + if not os.path.isdir(search_dir): + continue + for root, dirs, files in os.walk(search_dir): + dirs.sort() + for fname in sorted(files): + if not ( + fname.endswith(".py") + or fname.endswith(".md") + or fname.endswith(".rs") + or fname.endswith(".toml") + or fname.endswith(".yaml") + or fname.endswith(".yml") + or fname.endswith(".json") + or fname.endswith(".txt") + or fname.endswith(".cfg") + or fname.endswith(".ini") + ): + continue + filepath = os.path.join(root, fname) + try: + with open(filepath, errors="ignore") as 
f: + content = f.read() + if compiled.search(content): + truth_uris.add(local_path_to_viking_uri(filepath)) + except Exception: + pass + elapsed = time.monotonic() - t0 + return truth_uris, elapsed + + +def discover_local_repo_dirs() -> list[str]: + benchmark_dir = os.path.join(DATA_DIR, "benchmark") + if not os.path.isdir(benchmark_dir): + return [] + dirs = [] + for entry in sorted(os.listdir(benchmark_dir)): + path = os.path.join(benchmark_dir, entry) + if os.path.isdir(path) and not entry.startswith("."): + dirs.append(path) + return dirs + + +def compute_metrics(truth: set[str], predicted: set[str]) -> dict: + if not truth and not predicted: + return {"recall": 1.0, "precision": 1.0, "f1": 1.0, "tp": 0, "fp": 0, "fn": 0} + if not truth: + return {"recall": 0.0, "precision": 0.0, "f1": 0.0, "tp": 0, "fp": len(predicted), "fn": 0} + + tp = len(truth & predicted) + fp = len(predicted - truth) + fn = len(truth - predicted) + recall = tp / len(truth) + precision = tp / len(predicted) if predicted else 0.0 + f1 = 2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0.0 + return {"recall": recall, "precision": precision, "f1": f1, "tp": tp, "fp": fp, "fn": fn} + + +def main(): + uri = BASE_URI + search_dirs = discover_local_repo_dirs() + + if not search_dirs: + print(f"Error: No repo directories found under {DATA_DIR}/benchmark/") + print("Run step1_add_resource.py first.") + return + + if not KEYWORDS: + print("WARNING: KEYWORDS list is empty. 
Fill it with real terms before running.") + print(" Edit step3_retrieval_quality.py and add keywords to the KEYWORDS list.\n") + + test_patterns = build_test_patterns() + + print("=" * 110) + print("Retrieval Quality Evaluation: auto (bm25+fs) vs local fs (ground truth)") + print("=" * 110) + print(f"URI: {uri}") + print(f"Data dir: {DATA_DIR}/benchmark/") + print(f"Local dirs: {search_dirs}") + print(f"Patterns: {len(test_patterns)}") + print() + print("Ensure ov.conf has:") + print(' "grep": {"engine": "auto", "switch_to_remote_threshold": 0}') + print("And the server has been restarted.") + print() + + results = [] + + client = SyncOpenViking() + client.initialize() + + try: + for label, pattern in test_patterns: + print(f"--- {label} (pattern: {pattern}) ---") + + truth_uris, fs_elapsed = compute_ground_truth(pattern, search_dirs) + print(f" Ground truth (local fs): {len(truth_uris)} matches ({fs_elapsed:.2f}s)") + + try: + auto_uris, auto_elapsed = run_sdk_grep(client, uri, pattern) + except Exception as e: + print(f" Auto grep FAILED: {e}") + results.append( + { + "label": label, + "pattern": pattern, + "error": str(e), + "truth_count": len(truth_uris), + } + ) + continue + print(f" Auto grep (bm25+fs): {len(auto_uris)} matches ({auto_elapsed:.2f}s)") + + metrics = compute_metrics(truth_uris, auto_uris) + print( + f" Recall: {metrics['recall']:.4f} " + f"Precision: {metrics['precision']:.4f} " + f"F1: {metrics['f1']:.4f}" + ) + if metrics["fn"] > 0: + print(f" Missed (FN): {metrics['fn']}") + if metrics["fp"] > 0: + print(f" Extra (FP): {metrics['fp']}") + + if metrics["fn"] > 0: + missed = sorted(truth_uris - auto_uris)[:5] + print(" Sample missed URIs:") + for u in missed: + print(f" {u}") + + results.append( + { + "label": label, + "pattern": pattern, + "truth_count": len(truth_uris), + "auto_count": len(auto_uris), + "fs_elapsed_s": round(fs_elapsed, 3), + "auto_elapsed_s": round(auto_elapsed, 3), + **metrics, + } + ) + finally: + client.close() + + 
print() + print("=" * 120) + print( + f"{'Label':<45} {'Truth':>6} {'Auto':>6} {'Recall':>8} {'Prec':>8} {'F1':>8} {'Missed':>8}" + ) + print("-" * 120) + for r in results: + if "error" in r: + print( + f"{r['label']:<45} {r['truth_count']:>6} {'ERR':>6} " + f"{'---':>8} {'---':>8} {'---':>8} {'---':>8}" + ) + else: + print( + f"{r['label']:<45} {r['truth_count']:>6} {r['auto_count']:>6} " + f"{r['recall']:>8.4f} {r['precision']:>8.4f} {r['f1']:>8.4f} {r['fn']:>8}" + ) + print() + + has_recall_loss = any(r.get("fn", 0) > 0 for r in results) + has_precision_loss = any(r.get("fp", 0) > 0 for r in results) + if not has_recall_loss and not has_precision_loss: + print( + "VERDICT: All queries achieved perfect recall and precision. bm25 recall is complete." + ) + else: + if has_recall_loss: + print("VERDICT: Recall loss detected — some files not recalled by bm25.") + print( + " Possible causes: content field truncation, tokenizer mismatch, or incomplete reindex." + ) + if has_precision_loss: + print("VERDICT: Precision loss detected — unexpected matches in auto results.") + print( + " This should not happen (phase 2 regex guarantees precision). Investigate URI format." + ) + print() + + +if __name__ == "__main__": + main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py b/benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py deleted file mode 100644 index 06d4f8330e..0000000000 --- a/benchmark/retrieval/grep/vikingdb_bm25/step5_retrieval_quality.py +++ /dev/null @@ -1,266 +0,0 @@ -#!/usr/bin/env python3 -"""Step 5: Retrieval quality evaluation — compare auto (bm25) vs fs grep. - -Prerequisites: - 1. Run step1_generate.py to create test data - 2. Run step2_quick_add_resource.py to upload files - 3. Run step3_build_index.py to build index (embedding + content) - 4. Ensure ov.conf has: - "grep": {"engine": "auto", "switch_to_remote_threshold": 0} - (switch_to_remote_threshold = 0 forces VikingDB BM25 for all queries) - 5. 
Restart the server after changing ov.conf - -Approach: - - Ground truth: scan local benchmark files with Python regex (equivalent to fs engine) - - Test: call `ov grep` CLI with --output json to get structured results - - Compare: compute Recall, Precision, F1 per query pattern - -NOTE: `remote_return_limit` defaults to 0 (auto-adapt to 100000), so bm25 recall -is not truncated. No need to test different limit values. - -Usage: - python3 step5_retrieval_quality.py [--uri URI] -""" - -import json -import os -import re -import shlex -import subprocess -import time - -BASE_URI = "viking://resources/benchmark" -OV_CMD = ["ov", "--account", "default", "--user", "default"] -DATA_DIR = os.path.expanduser("~/.openviking/data") -BENCHMARK_DIR = os.path.join(DATA_DIR, "benchmark") - -# Test patterns covering different keyword types -# (label, pattern) -TEST_PATTERNS = [ - # CamelCase - ("CamelCase: VikingDB", "VikingDB"), - # PascalCase - ("PascalCase: FullText", "FullText"), - # lowercase - ("lowercase: bm25", "bm25"), - # snake_case - ("snake_case: search_by_keywords", "search_by_keywords"), - # Multi-keyword regex - ("multi: VikingDB|FullText", "VikingDB|FullText"), - ("multi: VikingDB|FullText|bm25", "VikingDB|FullText|bm25"), - # No-match - ("no-match: zzz_nonexistent", "zzz_nonexistent"), -] - - -def run_ov_grep(uri: str, pattern: str) -> tuple[set[str], float]: - """Run `ov grep --output json` and extract matched URIs.""" - cmd = OV_CMD + [ - "--output", - "json", - "grep", - "--uri", - uri, - "-n", - "100000", - pattern, - ] - - cmd_str = shlex.join(cmd) - print(f" $ {cmd_str}") - - t0 = time.monotonic() - result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) - elapsed = time.monotonic() - t0 - - if result.returncode != 0: - stderr = result.stderr.strip()[:200] - raise RuntimeError(f"ov grep failed (exit={result.returncode}): {stderr}") - - # Find the JSON line in stdout (skip echo_command output like "cmd: ov grep ...") - json_line = None - for 
line in result.stdout.strip().splitlines(): - line = line.strip() - if line.startswith("{"): - json_line = line - break - if not json_line: - raise RuntimeError(f"No JSON output from ov grep. stdout: {result.stdout[:200]}") - - # Parse JSON response: {"status": "ok", "result": {"matches": [...], ...}} - try: - resp = json.loads(json_line) - except json.JSONDecodeError as e: - raise RuntimeError(f"Failed to parse ov grep output: {e}") from e - - grep_result = resp.get("result", resp) - uris = set() - for match in grep_result.get("matches", []): - uri = match.get("uri", "") - if uri: - uris.add(uri.rstrip("/")) - return uris, elapsed - - -def local_path_to_viking_uri(filepath: str) -> str: - """Convert a local benchmark file path to a viking URI.""" - rel = os.path.relpath(filepath, DATA_DIR) - return "viking://resources/" + rel.replace(os.sep, "/").rstrip("/") - - -def compute_ground_truth(pattern: str) -> tuple[set[str], float]: - """Scan local benchmark files with Python regex to get ground truth.""" - compiled = re.compile(pattern) - truth_uris = set() - t0 = time.monotonic() - for root, dirs, files in os.walk(BENCHMARK_DIR): - dirs.sort() - for fname in sorted(files): - if not fname.endswith(".md"): - continue - filepath = os.path.join(root, fname) - try: - with open(filepath) as f: - content = f.read() - if compiled.search(content): - truth_uris.add(local_path_to_viking_uri(filepath)) - except Exception: - pass - elapsed = time.monotonic() - t0 - return truth_uris, elapsed - - -def compute_metrics(truth: set[str], predicted: set[str]) -> dict: - """Compute recall, precision, F1.""" - if not truth and not predicted: - return {"recall": 1.0, "precision": 1.0, "f1": 1.0, "tp": 0, "fp": 0, "fn": 0} - if not truth: - return {"recall": 0.0, "precision": 0.0, "f1": 0.0, "tp": 0, "fp": len(predicted), "fn": 0} - - tp = len(truth & predicted) - fp = len(predicted - truth) - fn = len(truth - predicted) - recall = tp / len(truth) - precision = tp / len(predicted) if 
predicted else 0.0 - f1 = 2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0.0 - return {"recall": recall, "precision": precision, "f1": f1, "tp": tp, "fp": fp, "fn": fn} - - -def main(): - uri = BASE_URI - - if not os.path.isdir(BENCHMARK_DIR): - print(f"Error: Benchmark data not found at {BENCHMARK_DIR}") - print("Run step1_generate.py first.") - return - - print("=" * 100) - print("Retrieval Quality Evaluation: auto (bm25+fs) vs local fs (ground truth)") - print("=" * 100) - print(f"URI: {uri}") - print(f"Data dir: {BENCHMARK_DIR}") - print() - print("Ensure ov.conf has:") - print(' "grep": {"engine": "auto", "switch_to_remote_threshold": 0}') - print("And the server has been restarted.") - print() - - results = [] - - for label, pattern in TEST_PATTERNS: - print(f"--- {label} (pattern: {pattern}) ---") - - # Ground truth: scan local files - truth_uris, fs_elapsed = compute_ground_truth(pattern) - print(f" Ground truth (local fs): {len(truth_uris)} matches ({fs_elapsed:.2f}s)") - - # Auto grep (via ov CLI with --output json) - try: - auto_uris, auto_elapsed = run_ov_grep(uri, pattern) - except Exception as e: - print(f" Auto grep FAILED: {e}") - results.append( - { - "label": label, - "pattern": pattern, - "error": str(e), - "truth_count": len(truth_uris), - } - ) - continue - print(f" Auto grep (bm25+fs): {len(auto_uris)} matches ({auto_elapsed:.2f}s)") - - # Compute metrics - metrics = compute_metrics(truth_uris, auto_uris) - print( - f" Recall: {metrics['recall']:.4f} " - f"Precision: {metrics['precision']:.4f} " - f"F1: {metrics['f1']:.4f}" - ) - if metrics["fn"] > 0: - print(f" Missed (FN): {metrics['fn']}") - if metrics["fp"] > 0: - print(f" Extra (FP): {metrics['fp']}") - - # Show sample missed URIs for debugging - if metrics["fn"] > 0: - missed = sorted(truth_uris - auto_uris)[:5] - print(" Sample missed URIs:") - for u in missed: - print(f" {u}") - - results.append( - { - "label": label, - "pattern": pattern, - 
"truth_count": len(truth_uris), - "auto_count": len(auto_uris), - "fs_elapsed_s": round(fs_elapsed, 3), - "auto_elapsed_s": round(auto_elapsed, 3), - **metrics, - } - ) - - # Summary table - print() - print("=" * 110) - print( - f"{'Label':<40} {'Truth':>6} {'Auto':>6} {'Recall':>8} {'Prec':>8} {'F1':>8} {'Missed':>8}" - ) - print("-" * 110) - for r in results: - if "error" in r: - print( - f"{r['label']:<40} {r['truth_count']:>6} {'ERR':>6} " - f"{'---':>8} {'---':>8} {'---':>8} {'---':>8}" - ) - else: - print( - f"{r['label']:<40} {r['truth_count']:>6} {r['auto_count']:>6} " - f"{r['recall']:>8.4f} {r['precision']:>8.4f} {r['f1']:>8.4f} {r['fn']:>8}" - ) - print() - - # Verdict - has_recall_loss = any(r.get("fn", 0) > 0 for r in results) - has_precision_loss = any(r.get("fp", 0) > 0 for r in results) - if not has_recall_loss and not has_precision_loss: - print( - "VERDICT: All queries achieved perfect recall and precision. bm25 recall is complete." - ) - else: - if has_recall_loss: - print("VERDICT: Recall loss detected — some files not recalled by bm25.") - print( - " Possible causes: content field truncation, tokenizer mismatch, or incomplete reindex." - ) - if has_precision_loss: - print("VERDICT: Precision loss detected — unexpected matches in auto results.") - print( - " This should not happen (phase 2 regex guarantees precision). Investigate URI format." 
- ) - print() - - -if __name__ == "__main__": - main() From 7337653f055ba4a7118e71457898d040bb4ada39 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Fri, 29 May 2026 15:18:00 +0800 Subject: [PATCH 14/31] fix: step1 add resource by real code data --- .../grep/vikingdb_bm25/step1_add_resource.py | 158 +++++++++++++----- openviking/client/local.py | 73 ++++++++ 2 files changed, 190 insertions(+), 41 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py b/benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py index 81f18d1ab3..ed88719ded 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py @@ -1,40 +1,79 @@ #!/usr/bin/env python3 -"""Step 1: Import GitHub repositories as benchmark data via OpenViking SDK. +"""Step 1: Import local code directories as benchmark data via OpenViking SDK. -Imports one or more GitHub repositories into OpenViking using the Python SDK -(SyncOpenViking) to avoid HTTP timeout issues with the CLI. Resources are -added with wait=True so VLM summarization and embedding are completed -before the script returns. +Recursively scans a local directory, imports each subdirectory (at all depths) +separately via SyncOpenViking.add_resource (wait=True), and saves progress +after each directory for resumability. Directory order is deterministic +(sorted at each level). 
Usage: python3 step1_add_resource.py - python3 step1_add_resource.py --repos "https://github.com/volcengine/OpenViking" - python3 step1_add_resource.py --repos "https://github.com/volcengine/OpenViking" "https://github.com/another/repo" + python3 step1_add_resource.py --source ~/.openviking/data/benchmark/OpenViking-main """ from __future__ import annotations import argparse +import os import time from openviking.sync_client import SyncOpenViking -DEFAULT_REPOS = [ - "https://github.com/volcengine/OpenViking", -] - +DEFAULT_SOURCE = os.path.expanduser("~/.openviking/data/benchmark/OpenViking-main") +PROGRESS_FILE = os.path.expanduser("~/.openviking/data/benchmark/.code-import-progress") BENCHMARK_PARENT = "viking://resources/benchmark" +def load_progress() -> set[str]: + """Load completed directory names from progress file.""" + if not os.path.exists(PROGRESS_FILE): + return set() + with open(PROGRESS_FILE) as f: + return {line.strip() for line in f if line.strip()} + + +def save_progress(dir_name: str) -> None: + """Append a completed directory name to progress file.""" + os.makedirs(os.path.dirname(PROGRESS_FILE), exist_ok=True) + with open(PROGRESS_FILE, "a") as f: + f.write(dir_name + "\n") + + +def scan_subdirs_recursive(root: str) -> list[str]: + """Return sorted list of all subdirectory relative paths under root (recursive, deterministic order). + + Skips hidden directories (starting with '.'). Order is deterministic: + sorted at each level, parent before children. 
+ """ + result: list[str] = [] + + def _walk(dir_path: str, rel_prefix: str) -> None: + try: + entries = sorted(os.listdir(dir_path)) + except OSError: + return + for name in entries: + if name.startswith("."): + continue + full = os.path.join(dir_path, name) + if not os.path.isdir(full): + continue + rel = f"{rel_prefix}/{name}" if rel_prefix else name + result.append(rel) + _walk(full, rel) + + _walk(root, "") + return result + + def main(): parser = argparse.ArgumentParser( - description="Step 1: Import GitHub repos as benchmark data via SDK" + description="Step 1: Import local code directories as benchmark data via SDK" ) parser.add_argument( - "--repos", - nargs="+", - default=DEFAULT_REPOS, - help="GitHub repo URLs to import (default: OpenViking repo)", + "--source", + default=DEFAULT_SOURCE, + help=f"Local directory to import (default: {DEFAULT_SOURCE})", ) parser.add_argument( "--parent", @@ -43,47 +82,71 @@ def main(): ) args = parser.parse_args() + source = os.path.expanduser(args.source) + if not os.path.isdir(source): + print(f"ERROR: Source directory does not exist: {source}") + return + print("=" * 80) - print("Step 1: Import GitHub Repositories as Benchmark Data") + print("Step 1: Import Local Code Directories as Benchmark Data") print("=" * 80) - print(f" Repos: {args.repos}") - print(f" Parent: {args.parent}") + print(f" Source: {source}") + print(f" Parent: {args.parent}") + print(f" Progress: {PROGRESS_FILE}") print() + # Scan subdirectories recursively + subdirs = scan_subdirs_recursive(source) + total = len(subdirs) + print(f" Total directories to import: {total}") + print() + + if total == 0: + print("No subdirectories found. 
Nothing to import.") + return + + # Load progress + completed = load_progress() + if completed: + already_done = [d for d in subdirs if d in completed] + print(f" Resuming: {len(already_done)} directories already imported") + print() + client = SyncOpenViking() client.initialize() results = [] - for repo_url in args.repos: - repo_name = repo_url.rstrip("/").split("/")[-1] - print(f"--- Importing {repo_name} ---") + for i, rel_dir in enumerate(subdirs, 1): + if rel_dir in completed: + print(f" [{i}/{total}] SKIP (already done): {rel_dir}") + continue + + dir_path = os.path.join(source, rel_dir) + # Build parent URI: viking://resources/benchmark/ + parent_rel = os.path.dirname(rel_dir) + parent_uri = f"{args.parent}/{parent_rel}" if parent_rel else args.parent + print(f" [{i}/{total}] Importing: {rel_dir} ...", end="", flush=True) t0 = time.monotonic() try: result = client.add_resource( - path=repo_url, - parent=args.parent, - reason=f"benchmark data: {repo_name}", + path=dir_path, + parent=parent_uri, + reason=f"benchmark data: {rel_dir}", wait=True, create_parent=True, ) elapsed = time.monotonic() - t0 root_uri = result.get("root_uri", "?") - print(f" OK ({elapsed:.1f}s) -> {root_uri}") - results.append( - { - "repo": repo_url, - "status": "ok", - "elapsed_s": round(elapsed, 1), - "root_uri": root_uri, - } - ) + print(f" OK ({elapsed:.1f}s) -> {root_uri}") + save_progress(rel_dir) + results.append({"dir": rel_dir, "status": "ok", "elapsed_s": round(elapsed, 1)}) except Exception as e: elapsed = time.monotonic() - t0 - print(f" FAILED ({elapsed:.1f}s): {e}") + print(f" FAILED ({elapsed:.1f}s): {e}") results.append( { - "repo": repo_url, + "dir": rel_dir, "status": "failed", "elapsed_s": round(elapsed, 1), "error": str(e)[:500], @@ -92,20 +155,33 @@ def main(): client.close() + # Summary print() print("Summary:") for r in results: status = r["status"] - repo = r["repo"] + dir_name = r["dir"] elapsed = r["elapsed_s"] - print(f" {status.upper():>7s} {repo} 
({elapsed}s)") + line = f" {status.upper():>7s} {dir_name} ({elapsed}s)" + if status == "failed": + line += f" -- {r.get('error', '')}" + print(line) ok_count = sum(1 for r in results if r["status"] == "ok") - if ok_count == len(results): - print(f"\nAll {ok_count} repos imported and processed successfully.") + failed_count = sum(1 for r in results if r["status"] == "failed") + skipped_count = sum(1 for d in subdirs if d in completed) + total_done = skipped_count + ok_count + + print() + if total_done >= total and failed_count == 0: + print(f"All {total} directories imported and processed successfully.") print("Next step: run step2_benchmark.py to measure grep performance") else: - print(f"\n{ok_count}/{len(results)} repos imported successfully. Check errors above.") + print( + f" Imported: {ok_count} Failed: {failed_count} Skipped: {skipped_count} Remaining: {total - total_done}" + ) + if failed_count > 0: + print("Re-run this script to resume from where it left off.") if __name__ == "__main__": diff --git a/openviking/client/local.py b/openviking/client/local.py index 191a7fc48a..416316d945 100644 --- a/openviking/client/local.py +++ b/openviking/client/local.py @@ -553,6 +553,79 @@ async def _add_message_impl( "message_count": len(session.messages), } + async def batch_add_messages( + self, + session_id: str, + messages: list[dict], + telemetry: TelemetryRequest = False, + ) -> Dict[str, Any]: + """Add multiple messages to a session in a single request. + + Args: + session_id: Session ID + messages: List of message dicts, each with "role" and optionally + "content", "parts", "created_at", "role_id". + telemetry: Whether to attach operation telemetry data to the result. + + Returns: + Result dict with session_id, message_count, and added count. 
+ """ + execution = await run_with_telemetry( + operation="session.batch_add_messages", + telemetry=telemetry, + fn=lambda: self._batch_add_messages_impl(session_id, messages), + ) + return attach_telemetry_payload( + execution.result, + execution.telemetry, + ) + + async def _batch_add_messages_impl( + self, + session_id: str, + messages: list[dict], + ) -> Dict[str, Any]: + from openviking.message.part import TextPart, part_from_dict + + session = await self._service.sessions.get(session_id, self._ctx, auto_create=True) + + specs = [] + for msg in messages: + role = msg.get("role") + if not role: + raise ValueError("Each message must have a 'role' key") + + parts_data = msg.get("parts") + content = msg.get("content") + if parts_data is not None: + message_parts = [part_from_dict(p) for p in parts_data] + elif content is not None: + message_parts = [TextPart(text=content)] + else: + raise ValueError("Each message must have either 'content' or 'parts'") + + role_id = msg.get("role_id") + if role_id is None and role == "user": + role_id = self._ctx.user.user_id + elif role_id is None and role == "assistant": + role_id = self._ctx.user.agent_id + + specs.append( + { + "role": role, + "parts": [p.model_dump() for p in message_parts], + "role_id": role_id, + "created_at": msg.get("created_at"), + } + ) + + added = session.add_messages(specs) + return { + "session_id": session_id, + "message_count": len(session.messages), + "added": len(added), + } + # ============= Pack ============= async def export_ovpack( From a03c61be6b6ef649dd6f4bba2d1bdce113440169 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Fri, 29 May 2026 19:51:26 +0800 Subject: [PATCH 15/31] feat(benchmark): split grep benchmark into effectiveness/performance suites with async reindex --- .../retrieval/grep/vikingdb_bm25/README.md | 105 +++++ .../retrieval/grep/vikingdb_bm25/README_CN.md | 105 +++++ .../retrieval/grep/vikingdb_bm25/ai_wiki.txt | 369 ++++++++++++++++++ .../{ => 
effectiveness}/step1_add_resource.py | 49 ++- .../effectiveness/step2_reindex.py | 231 +++++++++++ .../step3_quality.py} | 83 +--- .../performance/step0_prepare_data.py | 206 ++++++++++ .../performance/step1_add_resource.py | 182 +++++++++ .../performance/step2_reindex.py | 231 +++++++++++ .../performance/step3_benchmark.py | 275 +++++++++++++ .../grep/vikingdb_bm25/step2_benchmark.py | 234 ----------- 11 files changed, 1748 insertions(+), 322 deletions(-) create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/README.md create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/README_CN.md create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/ai_wiki.txt rename benchmark/retrieval/grep/vikingdb_bm25/{ => effectiveness}/step1_add_resource.py (77%) create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_reindex.py rename benchmark/retrieval/grep/vikingdb_bm25/{step3_retrieval_quality.py => effectiveness/step3_quality.py} (69%) create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/performance/step0_prepare_data.py create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/performance/step1_add_resource.py create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/performance/step2_reindex.py create mode 100644 benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py delete mode 100644 benchmark/retrieval/grep/vikingdb_bm25/step2_benchmark.py diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README.md b/benchmark/retrieval/grep/vikingdb_bm25/README.md new file mode 100644 index 0000000000..d1f3a15b8e --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/README.md @@ -0,0 +1,105 @@ +# VikingDB BM25 Grep Benchmark + +Benchmark suite for evaluating OpenViking's grep retrieval with VikingDB BM25 engine. 
+ +## Directory Structure + +``` +vikingdb_bm25/ +├── ai_wiki.txt # Source text for synthetic data generation +├── effectiveness/ # Retrieval effectiveness (recall/precision/F1) +│ ├── step1_add_resource.py +│ ├── step2_reindex.py +│ └── step3_quality.py +└── performance/ # Retrieval performance (latency + recall at scale) + ├── step0_prepare_data.py + ├── step1_add_resource.py + ├── step2_reindex.py + └── step3_benchmark.py +``` + +## Effectiveness — Retrieval Quality + +Tests whether grep can find **all** matching files in real code repositories. + +**Data source:** Real code repos (download manually, place under `~/.openviking/data/benchmark/`). + +| Step | Script | Description | +|------|--------|-------------| +| 1 | `step1_add_resource.py` | Import code repos (no indexing, fast) | +| 2 | `step2_reindex.py` | Async reindex via openviking-server (concurrency=2, polling) | +| 3 | `step3_quality.py` | Compare SDK grep vs local regex ground truth | + +### Usage + +```bash +# Step 1: Import repos (no VLM/embedding) +cd effectiveness/ +python3 step1_add_resource.py --source ~/.openviking/data/benchmark/OpenViking-main + +# Step 2: Build vector indexes (requires openviking-server running) +python3 step2_reindex.py +# Optional: --concurrency N (default: 2) + +# Step 3: Edit KEYWORDS list in step3_quality.py, then run +python3 step3_quality.py +``` + +## Performance — Latency & Recall at Scale + +Tests grep speed and recall on a large synthetic dataset (default: 100K files). + +**Data source:** Generated from `ai_wiki.txt` with target words injected at known probabilities. 
+ +| Step | Script | Description | +|------|--------|-------------| +| 0 | `step0_prepare_data.py` | Generate synthetic dataset (dir_xxx/wiki_xxx.txt) | +| 1 | `step1_add_resource.py` | Import data (no VLM/embedding, fast) | +| 2 | `step2_reindex.py` | Async reindex via openviking-server (concurrency=2, polling) | +| 3 | `step3_benchmark.py` | Measure latency and recall | + +### Target Words + +12 words across 4 probability tiers: + +| Probability | Words | Expected hits (per 100K files) | +|-------------|-------|-------------------------------| +| 50% | quantumnexus, synapseflow, deepvector | ~50,000 | +| 10% | bm25engine, vikingcore, retrievex | ~10,000 | +| 0.1% | zephyrhash, cryptolattice, nebulalink | ~100 | +| 0.01% | xenoform, quarkpulse, omegabind | ~10 | + +### Usage + +```bash +cd performance/ + +# Step 0: Generate data (default: 100 dirs x 1000 files = 100K files) +python3 step0_prepare_data.py + +# Step 1: Import without indexing (fast) +python3 step1_add_resource.py + +# Step 2: Build vector indexes (requires openviking-server running) +python3 step2_reindex.py +# Optional: --concurrency N (default: 2) + +# Step 3: Benchmark — run with different engine configs +# Run A: fs engine +# 1. Set ov.conf: "grep": {"engine": "fs"} +# 2. Restart server +python3 step3_benchmark.py --engine-label fs + +# Run B: auto engine (bm25) +# 1. Set ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0} +# 2. 
Restart server +python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json +``` + +## Key Concepts + +- **Effectiveness** tests use real code and measure exact recall/precision against ground truth +- **Performance** tests use synthetic data with known injection probabilities for approximate recall +- Both follow the same workflow: import (no indexing) → reindex → benchmark/evaluate +- Both support **resumable** execution via progress files (separate for import and reindex) +- Change grep engine via `ov.conf` and restart the server between benchmark runs diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md new file mode 100644 index 0000000000..23caec61f5 --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md @@ -0,0 +1,105 @@ +# VikingDB BM25 Grep 基准测试 + +用于评估 OpenViking grep 检索配合 VikingDB BM25 引擎的基准测试套件。 + +## 目录结构 + +``` +vikingdb_bm25/ +├── ai_wiki.txt # 合成数据生成的原始文本 +├── effectiveness/ # 检索效果测试(召回率/精确率/F1) +│ ├── step1_add_resource.py +│ ├── step2_reindex.py +│ └── step3_quality.py +└── performance/ # 检索性能测试(延迟 + 大规模召回) + ├── step0_prepare_data.py + ├── step1_add_resource.py + ├── step2_reindex.py + └── step3_benchmark.py +``` + +## Effectiveness — 检索效果 + +测试 grep 在真实代码仓库中是否能找到**所有**匹配文件。 + +**数据来源:** 真实代码仓库(手动下载,放置于 `~/.openviking/data/benchmark/`)。 + +| 步骤 | 脚本 | 说明 | +|------|------|------| +| 1 | `step1_add_resource.py` | 导入代码仓库(不建索引,速度快) | +| 2 | `step2_reindex.py` | 通过 openviking-server 异步构建索引(并发=2,轮询) | +| 3 | `step3_quality.py` | SDK grep 与本地正则 ground truth 对比 | + +### 使用方法 + +```bash +# 步骤 1:导入代码仓库(不建索引) +cd effectiveness/ +python3 step1_add_resource.py --source ~/.openviking/data/benchmark/OpenViking-main + +# 步骤 2:构建向量索引(需 openviking-server 运行中) +python3 step2_reindex.py +# 可选参数:--concurrency N (默认:2) + +# 步骤 3:编辑 step3_quality.py 中的 KEYWORDS 列表,然后运行 +python3 step3_quality.py +``` + +## Performance — 检索性能 + +在大规模合成数据集(默认 10 万文件)上测试 grep 速度和召回率。 + 
+**数据来源:** 从 `ai_wiki.txt` 生成,按已知概率注入目标单词。 + +| 步骤 | 脚本 | 说明 | +|------|------|------| +| 0 | `step0_prepare_data.py` | 生成合成数据集(dir_xxx/wiki_xxx.txt) | +| 1 | `step1_add_resource.py` | 导入数据(不建索引,速度快) | +| 2 | `step2_reindex.py` | 通过 openviking-server 异步构建索引(并发=2,轮询) | +| 3 | `step3_benchmark.py` | 测量延迟和召回率 | + +### 目标单词 + +12 个单词,分 4 个概率层级: + +| 概率 | 单词 | 预期命中数(每 10 万文件) | +|------|------|---------------------------| +| 50% | quantumnexus, synapseflow, deepvector | ~50,000 | +| 10% | bm25engine, vikingcore, retrievex | ~10,000 | +| 0.1% | zephyrhash, cryptolattice, nebulalink | ~100 | +| 0.01% | xenoform, quarkpulse, omegabind | ~10 | + +### 使用方法 + +```bash +cd performance/ + +# 步骤 0:生成数据(默认:100 目录 x 1000 文件 = 10 万文件) +python3 step0_prepare_data.py + +# 步骤 1:导入数据(不建索引,速度快) +python3 step1_add_resource.py + +# 步骤 2:构建向量索引(需 openviking-server 运行中) +python3 step2_reindex.py +# 可选参数:--concurrency N (默认:2) + +# 步骤 3:基准测试 — 用不同引擎配置各跑一次 +# 运行 A:fs 引擎 +# 1. 设置 ov.conf: "grep": {"engine": "fs"} +# 2. 重启服务 +python3 step3_benchmark.py --engine-label fs + +# 运行 B:auto 引擎(bm25) +# 1. 设置 ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0} +# 2. 重启服务 +python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json +``` + +## 核心概念 + +- **Effectiveness(效果测试)** 使用真实代码,对照 ground truth 计算精确的召回率/精确率 +- **Performance(性能测试)** 使用合成数据,根据已知注入概率计算近似召回率 +- 两者遵循相同流程:导入(不建索引)→ 构建索引 → 评估/测试 +- 两者均支持**断点续传**(导入和索引各有独立进度文件) +- 切换 grep 引擎需修改 `ov.conf` 并重启服务,在不同运行之间对比 diff --git a/benchmark/retrieval/grep/vikingdb_bm25/ai_wiki.txt b/benchmark/retrieval/grep/vikingdb_bm25/ai_wiki.txt new file mode 100644 index 0000000000..8030428061 --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/ai_wiki.txt @@ -0,0 +1,369 @@ +"AI" redirects here. For other uses, see AI (disambiguation) and Artificial intelligence (disambiguation). 
+ +Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in engineering, mathematics and computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.[1] + +High-profile applications of AI include advanced web search engines, chatbots, virtual assistants, autonomous vehicles, and play and analysis in strategy games (e.g., chess and Go). Since the 2020s, generative AI has become widely available to generate images, audio, and videos from text prompts. + +The traditional goals of AI research include learning, reasoning, knowledge representation, planning, natural language processing, and perception, as well as support for robotics.[a] To reach these goals, AI researchers have used techniques including state space search and mathematical optimization, formal logic, artificial neural networks, and methods based on statistics, operations research, and economics.[b] AI also draws upon psychology, linguistics, philosophy, neuroscience, and other fields.[2] Some companies, such as OpenAI, Google DeepMind and Meta, aim to create artificial general intelligence (AGI) – AI that can complete virtually any cognitive task at least as well as a human.[3] + +Artificial intelligence was founded as an academic discipline in 1956,[4] and the field went through multiple cycles of optimism throughout its history,[5][6] followed by periods of disappointment and loss of funding, known as AI winters.[7][8] Funding and interest increased substantially after 2012, when graphics processing units began being used to accelerate neural networks, and deep learning outperformed previous AI techniques.[9] This growth accelerated further after 2017 with the 
transformer architecture.[10] In the 2020s, an AI boom has coincided with advances in generative AI, which allowed for the creation and modification of media. In addition to AI safety and unintended consequences and harms from the use of AI, ethical concerns, AI's long-term effects, and potential existential risks have prompted discussions of AI regulation. + +Goals +The general problem of simulating (or creating) intelligence has been broken into subproblems. These consist of particular traits or capabilities that researchers expect an intelligent system to display. The traits described below have received the most attention and cover the scope of AI research.[a] + +Reasoning and problem-solving +Early researchers developed algorithms that imitated step-by-step reasoning that humans use when they solve puzzles or make logical deductions.[11] By the late 1980s and 1990s, methods were developed for dealing with uncertain or incomplete information, employing concepts from probability and economics.[12] + +Many of these algorithms are insufficient for solving large reasoning problems because they experience a "combinatorial explosion": They become exponentially slower as the problems grow.[13] Even humans rarely use the step-by-step deduction that early AI research could model. They solve most of their problems using fast, intuitive judgments.[14] Accurate and efficient reasoning is an unsolved problem. + +Knowledge representation + +An ontology represents knowledge as a set of concepts within a domain and the relationships between those concepts. +Knowledge representation and knowledge engineering[15] allow AI programs to answer questions intelligently and make deductions about real-world facts. 
Formal knowledge representations are used in content-based indexing and retrieval,[16] scene interpretation,[17] clinical decision support,[18] knowledge discovery (mining "interesting" and actionable inferences from large databases),[19] and other areas.[20] + +A knowledge base is a body of knowledge represented in a form that can be used by a program. An ontology is the set of objects, relations, concepts, and properties used by a particular domain of knowledge.[21] Knowledge bases need to represent things such as objects, properties, categories, and relations between objects;[22] situations, events, states, and time;[23] causes and effects;[24] knowledge about knowledge (what we know about what other people know);[25] default reasoning (things that humans assume are true until they are told differently and will remain true even when other facts are changing);[26] and many other aspects and domains of knowledge. + +Among the most difficult problems in knowledge representation are the breadth of commonsense knowledge (the set of atomic facts that the average person knows is enormous);[27] and the sub-symbolic form of most commonsense knowledge (much of what people know is not represented as "facts" or "statements" that they could express verbally).[14] There is also the difficulty of knowledge acquisition, the problem of obtaining knowledge for AI applications.[c] + +Planning and decision-making +An "agent" is any entity (artificial or not) that perceives and takes actions in the world. A rational agent has goals or preferences and takes actions to make them happen.[d][30] In automated planning, the agent has a specific goal.[31] In automated decision-making, the agent has preferences—there are some situations it would prefer to be in, and some situations it is trying to avoid. The decision-making agent assigns a number to each situation (called the "utility") that measures how much the agent prefers it. 
For each possible action, it can calculate the "expected utility": the utility of all possible outcomes of the action, weighted by the probability that the outcome will occur. It can then choose the action with the maximum expected utility.[32] + +In classical planning, the agent knows exactly what the effect of any action will be.[33] In most real-world problems, however, the agent may not be certain about the situation they are in (it is "unknown" or "unobservable") and it may not know for certain what will happen after each possible action (it is not "deterministic"). It must choose an action by making a probabilistic guess and then reassess the situation to see if the action worked.[34] + +Alongside thorough testing and improvement based on previous decisions, having an explanation for why the agent took certain decisions is a way to build trust, especially when the decisions have to be relied upon.[35] + +In some problems, the agent's preferences may be uncertain, especially if there are other agents or humans involved. These can be learned (e.g., with inverse reinforcement learning), or the agent can seek information to improve its preferences.[36] Information value theory can be used to weigh the value of exploratory or experimental actions.[37] The space of possible future actions and situations is typically intractably large, so the agents must take actions and evaluate situations while being uncertain of what the outcome will be. + +A Markov decision process has a transition model that describes the probability that a particular action will change the state in a particular way and a reward function that supplies the utility of each state and the cost of each action. A policy associates a decision with each possible state. 
The policy could be calculated (e.g., by iteration), be heuristic, or it can be learned.[38] + +Game theory describes the rational behavior of multiple interacting agents and is used in AI programs that make decisions that involve other agents.[39] + +Learning +Machine learning is the study of programs that can improve their performance on a given task automatically.[40] It has been a part of AI from the beginning.[e] + + +In supervised learning, the training data is labelled with the expected answers, while in unsupervised learning, the model identifies patterns or structures in unlabelled data. +There are several kinds of machine learning. Unsupervised learning analyzes a stream of data and finds patterns and makes predictions without any other guidance.[43] Supervised learning requires labeling the training data with the expected answers, and comes in two main varieties: classification (where the program must learn to predict what category the input belongs in) and regression (where the program must deduce a numeric function based on numeric input).[44] + +In reinforcement learning, the agent is rewarded for good responses and punished for bad ones. 
The agent learns to choose responses that are classified as "good".[45] Transfer learning is when the knowledge gained from one problem is applied to a new problem.[46] Deep learning is a type of machine learning that runs inputs through biologically inspired artificial neural networks for all of these types of learning.[47] + +Computational learning theory can assess learners by computational complexity, by sample complexity (how much data is required), or by other notions of optimization.[48] + +Natural language processing +Natural language processing (NLP) allows programs to read, write and communicate in human languages.[49] Specific problems include speech recognition, speech synthesis, machine translation, information extraction, information retrieval and question answering.[50] + +Early work, based on Noam Chomsky's generative grammar and semantic networks, had difficulty with word-sense disambiguation[f] unless restricted to small domains called "micro-worlds" (due to the common sense knowledge problem[27]). Margaret Masterman believed that it was meaning and not grammar that was the key to understanding languages, and that thesauri and not dictionaries should be the basis of computational language structure. + +Modern deep learning techniques for NLP include word embedding (representing words, typically as vectors encoding their meaning),[51] transformers (a deep learning architecture using an attention mechanism),[52] and others.[53] In 2019, generative pre-trained transformer (or "GPT") language models began to generate coherent text,[54][55] and by 2023, these models were able to get human-level scores on the bar exam, SAT test, GRE test, and many other real-world applications.[56] + +Perception +Machine perception is the ability to use input from sensors (such as cameras, microphones, wireless signals, active lidar, sonar, radar, and tactile sensors) to deduce aspects of the world. 
Computer vision is the ability to analyze visual input.[57] + +The field includes speech recognition,[58] image classification,[59] facial recognition, object recognition,[60] object tracking,[61] and robotic perception.[62] + +Social intelligence + +Kismet, a robot head made in the 1990s, is a machine that can recognize and simulate emotions.[63] +Affective computing is a field that comprises systems that recognize, interpret, process, or simulate human feeling, emotion, and mood.[64] For example, some virtual assistants are programmed to speak conversationally or even to banter humorously; it makes them appear more sensitive to the emotional dynamics of human interaction, or to otherwise facilitate human–computer interaction. + +However, this tends to give naïve users an unrealistic conception of the intelligence of existing computer agents.[65] Moderate successes related to affective computing include textual sentiment analysis and, more recently, multimodal sentiment analysis, wherein AI classifies the affects displayed by a videotaped subject.[66] + +General intelligence +A machine with artificial general intelligence would be able to solve a wide variety of problems with breadth and versatility similar to human intelligence.[67] + +Techniques +AI research uses a wide variety of techniques to accomplish the goals above.[b] + +Search and optimization +There are two different kinds of search used in AI: state space search and local search. + +State space search +State space search searches through a tree of possible states to try to find a goal state.[68] For example, planning algorithms search through trees of goals and subgoals, attempting to find a path to a target goal, a process called means-ends analysis.[69] + +Simple exhaustive searches[70] are rarely sufficient for most real-world problems: the search space (the number of places to search) quickly grows to astronomical numbers. 
The result is a search that is too slow or never completes.[13] "Heuristics" or "rules of thumb" can help prioritize choices that are more likely to reach a goal.[71] + +Adversarial search is used for game-playing programs, such as chess or Go. It searches through a tree of possible moves and countermoves, looking for a winning position.[72] + +Local search + +Illustration of gradient descent for three different starting points; two parameters (represented by the plane coordinates) are adjusted in order to minimize the loss function (the height). +Local search uses mathematical optimization to find a solution to a problem. It begins with some form of guess and refines it incrementally.[73] + +Gradient descent is a type of local search that optimizes a set of numerical parameters by incrementally adjusting them to minimize a loss function. Variants of gradient descent are commonly used to train neural networks,[74] through the backpropagation algorithm. + +Another type of local search is evolutionary computation, which aims to iteratively improve a set of candidate solutions by "mutating" and "recombining" them, selecting only the fittest to survive each generation.[75] + +Distributed search processes can coordinate via swarm intelligence algorithms. 
Two popular swarm algorithms used in search are particle swarm optimization (inspired by bird flocking) and ant colony optimization (inspired by ant trails).[76] + +Logic +Formal logic is used for reasoning and knowledge representation.[77] Formal logic comes in two main forms: propositional logic (which operates on statements that are true or false and uses logical connectives such as "and", "or", "not" and "implies")[78] and predicate logic (which also operates on objects, predicates and relations and uses quantifiers such as "Every X is a Y" and "There are some Xs that are Ys").[79] + +Deductive reasoning in logic is the process of proving a new statement (conclusion) from other statements that are given and assumed to be true (the premises).[80] Proofs can be structured as proof trees, in which nodes are labelled by sentences, and children nodes are connected to parent nodes by inference rules. + +Given a problem and a set of premises, problem-solving reduces to searching for a proof tree whose root node is labelled by a solution of the problem and whose leaf nodes are labelled by premises or axioms. In the case of Horn clauses, problem-solving search can be performed by reasoning forwards from the premises or backwards from the problem.[81] In the more general case of the clausal form of first-order logic, resolution is a single, axiom-free rule of inference, in which a problem is solved by proving a contradiction from premises that include the negation of the problem to be solved.[82] + +Inference in both Horn clause logic and first-order logic is undecidable, and therefore intractable. However, backward reasoning with Horn clauses, which underpins computation in the logic programming language Prolog, is Turing complete. Moreover, its efficiency is competitive with computation in other symbolic programming languages.[83] + +Fuzzy logic assigns a "degree of truth" between 0 and 1. 
It can therefore handle propositions that are vague and partially true.[84] + +Non-monotonic logics, including logic programming with negation as failure, are designed to handle default reasoning.[26] Other specialized versions of logic have been developed to describe many complex domains. + +Probabilistic methods for uncertain reasoning + +A simple Bayesian network, with the associated conditional probability tables +Many problems in AI (including reasoning, planning, learning, perception, and robotics) require the agent to operate with incomplete or uncertain information. AI researchers have devised a number of tools to solve these problems using methods from probability theory and economics.[85] Precise mathematical tools have been developed that analyze how an agent can make choices and plan, using decision theory, decision analysis,[86] and information value theory.[87] These tools include models such as Markov decision processes,[88] dynamic decision networks,[89] game theory and mechanism design.[90] + +Bayesian networks[91] are a tool that can be used for reasoning (using the Bayesian inference algorithm),[g][93] learning (using the expectation–maximization algorithm),[h][95] planning (using decision networks)[96] and perception (using dynamic Bayesian networks).[89] + +Probabilistic algorithms can also be used for filtering, prediction, smoothing, and finding explanations for streams of data, thus helping perception systems analyze processes that occur over time (e.g., hidden Markov models or Kalman filters).[89] + + +Expectation–maximization clustering of Old Faithful eruption data starts from a random guess but then successfully converges on an accurate clustering of the two physically distinct modes of eruption. +Classifiers and statistical learning methods +The simplest AI applications can be divided into two types: classifiers (e.g., "if shiny then diamond"), on one hand, and controllers (e.g., "if diamond then pick up"), on the other hand. 
Classifiers[97] are functions that use pattern matching to determine the closest match. They can be fine-tuned based on chosen examples using supervised learning. Each pattern (also called an "observation") is labeled with a certain predefined class. All the observations combined with their class labels are known as a data set. When a new observation is received, that observation is classified based on previous experience.[44] + +There are many kinds of classifiers in use.[98] The decision tree is the simplest and most widely used symbolic machine learning algorithm.[99] K-nearest neighbor algorithm was the most widely used analogical AI until the mid-1990s, and Kernel methods such as the support vector machine (SVM) displaced k-nearest neighbor in the 1990s.[100] The naive Bayes classifier is reportedly the "most widely used learner"[101] at Google, due in part to its scalability.[102] Neural networks are also used as classifiers.[103] + +Artificial neural networks + +A neural network is an interconnected group of nodes, akin to the vast network of neurons in the human brain. +An artificial neural network is based on a collection of nodes also known as artificial neurons, which loosely model the neurons in a biological brain. It is trained to recognise patterns; once trained, it can recognise those patterns in fresh data. There is an input, at least one hidden layer of nodes and an output. Each node applies a function and once the weight crosses its specified threshold, the data is transmitted to the next layer. A network is typically called a deep neural network if it has at least 2 hidden layers.[103] + +Learning algorithms for neural networks use local search to choose the weights that will get the right output for each input during training. The most common training technique is the backpropagation algorithm.[104] Neural networks learn to model complex relationships between inputs and outputs and find patterns in data. 
In theory, a neural network can learn any function.[105] + +In feedforward neural networks the signal passes in only one direction.[106] The term perceptron typically refers to a single-layer neural network.[107] In contrast, deep learning uses many layers.[108] Recurrent neural networks (RNNs) feed the output signal back into the input, which allows short-term memories of previous input events. Long short-term memory networks (LSTMs) are recurrent neural networks that better preserve long-term dependencies and are less sensitive to the vanishing gradient problem.[109] Convolutional neural networks (CNNs) use layers of kernels to more efficiently process local patterns. This local processing is especially important in image processing, where the early CNN layers typically identify simple local patterns such as edges and curves, with subsequent layers detecting more complex patterns like textures, and eventually whole objects.[110] + +Deep learning + +Deep learning is a subset of machine learning, which is itself a subset of artificial intelligence.[111] +Deep learning uses several layers of neurons between the network's inputs and outputs.[108] The multiple layers can progressively extract higher-level features from the raw input. For example, in image processing, lower layers may identify edges, while higher layers may identify the concepts relevant to a human such as digits, letters, or faces.[112] + +Deep learning has profoundly improved the performance of programs in many important subfields of artificial intelligence, including computer vision, speech recognition, natural language processing, image classification,[113] and others. 
The reason that deep learning performs so well in so many applications is not known as of 2021.[114] The sudden success of deep learning in 2012–2015 did not occur because of some new discovery or theoretical breakthrough (deep neural networks and backpropagation had been described by many people, as far back as the 1950s)[i] but because of two factors: the incredible increase in computer power (including the hundred-fold increase in speed by switching to GPUs) and the availability of vast amounts of training data, especially the giant curated datasets used for benchmark testing, such as ImageNet.[j] + +GPT +Generative pre-trained transformers (GPT) are large language models (LLMs) that generate text based on the semantic relationships between words in sentences. Text-based GPT models are pre-trained on a large corpus of text that can be from the Internet. The pretraining consists of predicting the next token (a token being usually a word, subword, or punctuation). Throughout this pretraining, GPT models accumulate knowledge about the world and can then generate human-like text by repeatedly predicting the next token. Typically, a subsequent training phase makes the model more truthful, useful, and harmless, usually with a technique called reinforcement learning from human feedback (RLHF). Current GPT models are prone to generating falsehoods called "hallucinations". 
These can be reduced with RLHF and quality data, but the problem has been getting worse for reasoning systems.[122] Such systems are used in chatbots, which allow people to ask a question or request a task in simple text.[123][124] + +Current models and services include ChatGPT, Claude, Gemini, Copilot, and Meta AI.[125] Multimodal GPT models can process different types of data (modalities) such as images, videos, sound, and text.[126] + +Hardware and software +Main articles: Programming languages for artificial intelligence and Hardware for artificial intelligence + +Raspberry Pi AI Kit +In the late 2010s, graphics processing units (GPUs) that were increasingly designed with AI-specific enhancements and used with specialized TensorFlow software had replaced previously used central processing units (CPUs) as the dominant means for large-scale (commercial and academic) machine learning models' training.[127] Specialized programming languages such as Prolog were used in early AI research,[128] but general-purpose programming languages like Python have become predominant.[129] + +The transistor density in integrated circuits has been observed to roughly double every 18 months—a trend known as Moore's law, named after the Intel co-founder Gordon Moore, who first identified it. Improvements in GPUs have been even faster,[130] a trend sometimes called Huang's law,[131] named after Nvidia co-founder and CEO Jensen Huang. 
+ +Applications +Main article: Applications of artificial intelligence + +AI Overviews, an example of AI use on search engines +AI and machine learning technology is used in most of the essential applications of the 2020s, including: + +search engines (such as Google Search) +targeting online advertisements +recommendation systems (offered by Netflix, YouTube or Amazon) driving internet traffic +targeted advertising (AdSense, Facebook) +virtual assistants (such as Siri or Alexa) +autonomous vehicles (including drones, ADAS and self-driving cars) +automatic language translation (Microsoft Translator, Google Translate) +facial recognition (Apple's FaceID or Microsoft's DeepFace and Google's FaceNet) +image labeling (used by Facebook, Apple's Photos and TikTok). +The deployment of AI may be overseen by a chief automation officer (CAO). + +Health and medicine +Main article: Artificial intelligence in healthcare +It has been suggested that AI can overcome discrepancies in funding allocated to different fields of research.[132] + +AlphaFold 2 (2021) demonstrated the ability to approximate, in hours rather than months, the 3D structure of a protein.[133] In 2023, it was reported that AI-guided drug discovery helped find a class of antibiotics capable of killing two different types of drug-resistant bacteria.[134] In 2024, researchers used machine learning to accelerate the search for Parkinson's disease drug treatments. Their aim was to identify compounds that block the clumping, or aggregation, of alpha-synuclein (the protein that characterises Parkinson's disease). 
They were able to speed up the initial screening process ten-fold and reduce the cost by a thousand-fold.[135][136] + +Gaming +Main article: Artificial intelligence in video games +Game playing programs have been used since the 1950s to demonstrate and test AI's most advanced techniques.[137] Deep Blue became the first computer chess-playing system to beat a reigning world chess champion, Garry Kasparov, on 11 May 1997.[138] In 2011, in a Jeopardy! quiz show exhibition match, IBM's question answering system, Watson, defeated the two greatest Jeopardy! champions, Brad Rutter and Ken Jennings, by a significant margin.[139] In March 2016, AlphaGo won 4 out of 5 games of Go in a match with Go champion Lee Sedol, becoming the first computer Go-playing system to beat a professional Go player without handicaps. Then, in 2017, it defeated Ke Jie, who was the best Go player in the world.[140] Other programs handle imperfect-information games, such as the poker-playing program Pluribus.[141] DeepMind developed increasingly generalistic reinforcement learning models, such as with MuZero, which could be trained to play chess, Go, or Atari games.[142] In 2019, DeepMind's AlphaStar achieved grandmaster level in StarCraft II, a particularly challenging real-time strategy game that involves incomplete knowledge of what happens on the map.[143] In 2021, an AI agent competed in a PlayStation Gran Turismo competition, winning against four of the world's best Gran Turismo drivers using deep reinforcement learning.[144] In 2024, Google DeepMind introduced SIMA, a type of AI capable of autonomously playing nine previously unseen open-world video games by observing screen output, as well as executing short, specific tasks in response to natural language instructions.[145] + +Mathematics +In mathematics, probabilistic large language models are versatile, but can also produce wrong answers in the form of hallucinations. 
The Alibaba Group developed a version of its Qwen models called Qwen2-Math, that achieved state-of-the-art performance on several mathematical benchmarks, including 84% accuracy on the MATH dataset of competition mathematics problems.[146] In January 2025, Microsoft proposed the technique rStar-Math that leverages Monte Carlo tree search and step-by-step reasoning, enabling a relatively small language model like Qwen-7B to solve 53% of the AIME 2024 and 90% of the MATH benchmark problems.[147] Google DeepMind has developed models for solving mathematical problems: AlphaTensor, AlphaGeometry, AlphaProof and AlphaEvolve.[148][149] + +When natural language is used to describe mathematical problems, converters can transform such prompts into a formal language such as Lean to define mathematical tasks. The experimental model Gemini Deep Think accepts natural language prompts directly and achieved gold medal results in the International Math Olympiad of 2025.[150] + +Topological deep learning integrates various topological approaches. + +Finance +According to Nicolas Firzli, director of the World Pensions & Investments Forum, it may be too early to see the emergence of highly innovative AI-informed financial products and services. 
He argues that "the deployment of AI tools will simply further automatise things: destroying tens of thousands of jobs in banking, financial planning, and pension advice in the process, but I'm not sure it will unleash a new wave of [e.g., sophisticated] pension innovation."[151] + +Military +Main article: Military applications of artificial intelligence +Various countries are deploying AI military applications.[152] The main applications enhance command and control, communications, sensors, integration and interoperability.[153] Research is targeting intelligence collection and analysis, logistics, cyber operations, information operations, and semiautonomous and autonomous vehicles.[152] AI technologies enable coordination of sensors and effectors, threat detection and identification, marking of enemy positions, target acquisition, coordination and deconfliction of distributed Joint Fires between networked combat vehicles, both human-operated and autonomous.[153] + +AI has been used in military operations in Iraq, Syria, Israel and Ukraine.[152][154][155][156] + +Generative AI +These paragraphs are an excerpt from Generative AI.[edit] +Generative artificial intelligence (GenAI) is a subfield of artificial intelligence (AI) that uses generative models to generate text, images, videos, audio, software code (vibe coding) or other forms of data.[157] These models learn the underlying patterns and structures of their training data, and use them to generate new data[158] in response to input, which often takes the form of natural language prompts.[159][160] + +The prevalence of generative AI tools has increased significantly since the AI boom in the 2020s. This boom was made possible by improvements in deep neural networks, particularly large language models (LLMs), which are based on the transformer architecture. 
Generative AI applications include chatbots such as ChatGPT, Claude, Copilot, DeepSeek, Google Gemini and Grok; text-to-image models such as DALL-E, Firefly, Stable Diffusion, and Midjourney; and text-to-video models such as Veo, LTX and Sora.[161][162][163] + +Companies in a variety of sectors have used generative AI, including those in software development, healthcare,[164] finance,[165] entertainment,[166] customer service,[167] sales and marketing,[168] art, writing,[169] and product design.[170] + +Agents +Main article: Agentic AI +See also: OpenClaw and CrewAI +AI agents are software entities designed to perceive their environment, make decisions, and take actions autonomously to achieve specific goals. These agents can interact with users, their environment, or other agents. AI agents are used in various applications, including virtual assistants, chatbots, autonomous vehicles, game-playing systems, and industrial robotics. AI agents operate within the constraints of their programming, available computational resources, and hardware limitations. This means they are restricted to performing tasks within their defined scope and have finite memory and processing capabilities. In real-world applications, AI agents often face time constraints for decision-making and action execution. Many AI agents incorporate learning algorithms, enabling them to improve their performance over time through experience or training. Using machine learning, AI agents can adapt to new situations and optimise their behaviour for their designated tasks.[171][172][173] + +Web search +Microsoft introduced Copilot Search in February 2023 under the name Bing Chat. 
Copilot Search provides AI-generated summaries.[174] + +Google introduced an AI Mode at its Google I/O event on 20 May 2025.[175] + +Sexuality +Applications of AI in this domain include AI-enabled menstruation and fertility trackers that analyze user data to offer predictions,[176] AI-integrated sex toys (e.g., teledildonics),[177] AI-generated sexual education content,[178] and AI agents that simulate sexual and romantic partners (e.g., Replika).[179] AI is also used for the production of non-consensual deepfake pornography, raising significant ethical and legal concerns.[180] + +AI technologies have also been used to attempt to identify online gender-based violence and online sexual grooming of minors.[181][182] + +Other industry-specific tasks +In a 2017 survey, one in five companies reported having incorporated "AI" in some offerings or processes.[183] + +In the field of evacuation and disaster management, AI has been used to investigate patterns in large-scale and small-scale evacuations using historical data from GPS, videos or social media.[184][185][186] + +During the 2024 Indian elections, US$50 million was spent on authorized AI-generated content, notably by creating deepfakes of allied (including sometimes deceased) politicians to better engage with voters, and by translating speeches to various local languages.[187] + +The use of generative AI by law firms for legal research resulted in the creation of the global "AI Hallucination Cases" database, in April 2025, established by HEC Paris and Sciences Po legal data analysis lecturer Damien Charlotin.[188][189] By 2026, judges had issued sanctions and bar associations had issued warnings due to attorney submissions to the courts containing fabricated case law citations hallucinated by AI tools.[190] + +See also: Hallucination (artificial intelligence) § In legal filings +Ethics +Main article: Ethics of artificial intelligence + +Street art in Tel Aviv[191][192] +AI has potential benefits and potential 
risks.[193] AI may be able to advance science and find solutions for serious problems: Demis Hassabis of DeepMind hopes to "solve intelligence, and then use that to solve everything else".[194] However, as the use of AI has become widespread, several unintended consequences and risks have been identified.[195][196] In-production systems can sometimes not factor ethics and bias into their AI training processes, especially when the AI algorithms are inherently unexplainable in deep learning.[197] + +Risks and harm +Privacy and copyright +Further information: Information privacy and Artificial intelligence and copyright +Machine learning algorithms require large amounts of data. The techniques used to acquire this data have raised concerns about privacy, surveillance and copyright. + +AI-powered devices and services, such as virtual assistants and IoT products, continuously collect personal information, raising concerns about intrusive data gathering and unauthorized access by third parties. The loss of privacy is further exacerbated by AI's ability to process and combine vast amounts of data, potentially leading to a surveillance society where individual activities are constantly monitored and analyzed without adequate safeguards or transparency. 
+ +Sensitive user data collected may include online activity records, geolocation data, video, or audio.[198] For example, in order to build speech recognition algorithms, Amazon has recorded millions of private conversations and allowed temporary workers to listen to and transcribe some of them.[199] Opinions about this widespread surveillance range from those who see it as a necessary evil to those for whom it is clearly unethical and a violation of the right to privacy.[200] + +AI developers argue that this is the only way to deliver valuable applications and have developed several techniques that attempt to preserve privacy while still obtaining the data, such as data aggregation, de-identification and differential privacy.[201] Since 2016, some privacy experts, such as Cynthia Dwork, have begun to view privacy in terms of fairness. Brian Christian wrote that experts have pivoted "from the question of 'what they know' to the question of 'what they're doing with it'."[202] + +Generative AI is often trained on unlicensed copyrighted works, including in domains such as images or computer code; the output is then used under the rationale of "fair use". Experts disagree about how well and under what circumstances this rationale will hold up in courts of law; relevant factors may include "the purpose and character of the use of the copyrighted work" and "the effect upon the potential market for the copyrighted work".[203][204] Website owners can indicate that they do not want their content scraped via a "robots.txt" file.[205] However, some companies will scrape content regardless[206][207] because the robots.txt file has no real authority. 
In 2023, leading authors (including John Grisham and Jonathan Franzen) sued AI companies for using their work to train generative AI.[208][209] Another discussed approach is to envision a separate sui generis system of protection for creations generated by AI to ensure fair attribution and compensation for human authors.[210] + +Dominance by tech giants +The commercial AI scene is dominated by Big Tech companies such as Alphabet Inc., Amazon, Apple Inc., Meta Platforms, and Microsoft.[211][212][213] Some of these players already own the vast majority of existing cloud infrastructure and computing power from data centers, allowing them to entrench further in the marketplace.[214][215] + +Power needs and environmental impacts +See also: Environmental impacts of artificial intelligence + +Fueled by a growth in AI, data centers' demand for power increased in the 2020s.[216] +Technology companies have built electricity and artificial intelligence infrastructure to facilitate the AI boom of the 2020s. A 2025 report from the consulting firm McKinsey & Company estimated that by 2030, $2.7 trillion would be invested into AI infrastructure and data centers in the US, surpassing World War II's Manhattan Project every month.[217] + +In January 2024, the International Energy Agency (IEA) released Electricity 2024, Analysis and Forecast to 2026.[218] This is the first IEA report to make projections for data centers and power consumption by AI and cryptocurrency. The report states that power demand for these uses might double by 2026, with the additional power consumption equaling that of Japan.[219] + +Power consumption by AI is responsible for an increase in fossil fuel use, and has delayed closings of obsolete, carbon-emitting coal energy facilities. 
A ChatGPT search involves the use of 10 times the electrical energy of a Google search.[220] + +A 2024 Goldman Sachs Research Paper, AI Data Centers and the Coming US Power Demand Surge, found "US power demand (is) likely to experience growth not seen in a generation...." and forecast that, by 2030, US data centers will consume 8% of US power, as opposed to 3% in 2022, presaging growth for the electrical power generation industry by a variety of means.[221] Data centers' need for more and more electrical power is such that they might max out the electrical grid. The Big Tech companies counter that AI can be used to maximize the utilization of the grid by all.[222] + +In 2024, The Wall Street Journal reported that big AI companies have begun negotiations with the US nuclear power providers to provide electricity to the data centers. In March 2024 Amazon purchased a Pennsylvania nuclear-powered data center for US$650 million.[223] + +In September 2024, Microsoft announced an agreement with Constellation Energy to re-open the Three Mile Island nuclear power plant to provide Microsoft with 100% of all electric power produced by the plant for 20 years. Reopening the plant, which suffered a partial nuclear meltdown of its Unit 2 reactor in 1979, will require Constellation to get through strict regulatory processes which will include extensive safety scrutiny from the US Nuclear Regulatory Commission. If approved (this will be the first ever US re-commissioning of a nuclear plant), over 835 megawatts of power – enough for 800,000 homes – will be produced. The cost for re-opening and upgrading is estimated at US$1.6 billion and is dependent on tax breaks for nuclear power contained in the 2022 US Inflation Reduction Act.[224] As of 2024, the US government and the state of Michigan have been investing almost US$2 billion to reopen the Palisades Nuclear reactor on Lake Michigan. 
Closed since 2022, the plant was planned to be reopened in October 2025.[225] + +After the last approval in September 2023, Taiwan suspended the approval of data centers north of Taoyuan with a capacity of more than 5 MW in 2024, due to power supply shortages.[226] Taiwan aims to phase out nuclear power by 2025.[226] + +Singapore imposed a ban on the opening of data centers in 2019 due to electric power constraints, but in 2022, lifted this ban.[226] + +Although most nuclear plants in Japan have been shut down after the 2011 Fukushima nuclear accident, according to an October 2024 Bloomberg article in Japanese, cloud gaming services company Ubitus, in which Nvidia has a stake, is looking for land in Japan near a nuclear power plant for a new data center for generative AI.[227] + +On 1 November 2024, the Federal Energy Regulatory Commission (FERC) rejected an application submitted by Talen Energy for approval to supply some electricity from the nuclear power station Susquehanna to Amazon's data center.[228] According to the Commission Chairman Willie L. Phillips, it is a burden on the electricity grid as well as a significant cost-shifting concern to households and other business sectors.[228] + +In 2025, a report prepared by the IEA estimated the greenhouse gas emissions from the energy consumption of AI at 180 million tonnes. By 2035, these emissions could rise to 300–500 million tonnes depending on what measures are taken. This is below 1.5% of the energy sector emissions. The emissions reduction potential of AI was estimated at 5% of the energy sector emissions, but rebound effects (for example if people switch from public transport to autonomous cars) can reduce it.[229] + +Misinformation +See also: Content moderation +YouTube, Facebook and others use recommender systems to guide users to more content. These AI programs were given the goal of maximizing user engagement (that is, the only goal was to keep people watching). 
The AI learned that users tended to choose misinformation, conspiracy theories, and extreme partisan content, and, to keep them watching, the AI recommended more of it. Users also tended to watch more content on the same subject, so the AI led people into filter bubbles where they received multiple versions of the same misinformation.[230] This convinced many users that the misinformation was true, and ultimately undermined trust in institutions, the media and the government.[231] The AI program had correctly learned to maximize its goal, but the result was harmful to society. After the U.S. election in 2016, major technology companies took some steps to mitigate the problem.[232] + +In the early 2020s, generative AI began to create images, audio, and texts that are virtually indistinguishable from real photographs, recordings, or human writing,[233] while realistic AI-generated videos became feasible in the mid-2020s.[234][235][236] It is possible for bad actors to use this technology to create massive amounts of misinformation or propaganda;[237] one such potential malicious use is deepfakes for computational propaganda.[238] AI pioneer and Nobel Prize-winning computer scientist Geoffrey Hinton expressed concern about AI enabling "authoritarian leaders to manipulate their electorates" on a large scale, among other risks.[239] The ability to influence electorates has been proved in at least one study. 
This same study shows more inaccurate statements from the models when they advocate for candidates of the political right.[240] + +AI researchers at Microsoft, OpenAI, universities and other organisations have suggested using "personhood credentials" as a way to overcome online deception enabled by AI models.[241] + +Algorithmic bias and fairness +Main articles: Algorithmic bias and Fairness (machine learning) +Machine learning applications can be biased[k] if they learn from biased data.[243] The developers may not be aware that the bias exists.[244] Discriminatory behavior by some LLMs can be observed in their output.[245] Bias can be introduced by the way training data is selected and by the way a model is deployed.[246][243] If a biased algorithm is used to make decisions that can seriously harm people (as it can in medicine, finance, recruitment, housing or policing) then the algorithm may cause discrimination.[247] The field of fairness studies how to prevent harms from algorithmic biases. + +On 28 June 2015, Google Photos's new image labeling feature mistakenly identified Jacky Alcine and a friend as "gorillas" because they were black. The system was trained on a dataset that contained very few images of black people,[248] a problem called "sample size disparity".[249] Google "fixed" this problem by preventing the system from labelling anything as a "gorilla". Eight years later, in 2023, Google Photos still could not identify a gorilla, and neither could similar products from Apple, Facebook, Microsoft and Amazon.[250] + +COMPAS is a commercial program widely used by U.S. courts to assess the likelihood of a defendant becoming a recidivist. In 2016, Julia Angwin at ProPublica discovered that COMPAS exhibited racial bias, despite the fact that the program was not told the races of the defendants. 
Although the error rate for both whites and blacks was calibrated to be equal at exactly 61%, the errors for each race were different—the system consistently overestimated the chance that a black person would re-offend and underestimated the chance that a white person would re-offend.[251] In 2017, several researchers[l] showed that it was mathematically impossible for COMPAS to accommodate all possible measures of fairness when the base rates of re-offense were different for whites and blacks in the data.[253] + +A program can make biased decisions even if the data does not explicitly mention a problematic feature (such as "race" or "gender"). The feature will correlate with other features (like "address", "shopping history" or "first name"), and the program will make the same decisions based on these features as it would on "race" or "gender".[254] Moritz Hardt said "the most robust fact in this research area is that fairness through blindness doesn't work."[255] + +Criticism of COMPAS highlighted that machine learning models are designed to make "predictions" that are only valid if we assume that the future will resemble the past. If they are trained on data that includes the results of racist decisions in the past, machine learning models must predict that racist decisions will be made in the future. If an application then uses these predictions as recommendations, some of these "recommendations" will likely be racist.[256] Thus, machine learning is not well suited to help make decisions in areas where there is hope that the future will be better than the past. It is descriptive rather than prescriptive.[m] + +Bias and unfairness may go undetected because the developers are overwhelmingly white and male: among AI engineers, about 4% are black and 20% are women.[249] + +There are various conflicting definitions and mathematical models of fairness. These notions depend on ethical assumptions, and are influenced by beliefs about society. 
One broad category is distributive fairness, which focuses on the outcomes, often identifying groups and seeking to compensate for statistical disparities. Representational fairness tries to ensure that AI systems do not reinforce negative stereotypes or render certain groups invisible. Procedural fairness focuses on the decision process rather than the outcome. The most relevant notions of fairness may depend on the context, notably the type of AI application and the stakeholders. The subjectivity in the notions of bias and fairness makes it difficult for companies to operationalize them. Having access to sensitive attributes such as race or gender is also considered by many AI ethicists to be necessary in order to compensate for biases, but it may conflict with anti-discrimination laws.[242] + +At the 2022 ACM Conference on Fairness, Accountability, and Transparency a paper reported that a CLIP‑based (Contrastive Language-Image Pre-training) robotic system reproduced harmful gender‑ and race‑linked stereotypes in a simulated manipulation task. The authors recommended that robot‑learning methods which physically manifest such harms be "paused, reworked, or even wound down when appropriate, until outcomes can be proven safe, effective, and just."[258][259][260] + +Lack of transparency +See also: Explainable AI, Algorithmic transparency, and Right to explanation +Many AI systems are so complex that their designers cannot explain how they reach their decisions.[261] This is particularly true of deep neural networks, in which there are many non-linear relationships between inputs and outputs. However, some popular explainability techniques exist.[262] + +It is impossible to be certain that a program is operating correctly if no one knows how exactly it works. There have been many cases where a machine learning program passed rigorous tests, but nevertheless learned something different than what the programmers intended. 
For example, a system that could identify skin diseases better than medical professionals was found to actually have a strong tendency to classify images with a ruler as "cancerous", because pictures of malignancies typically include a ruler to show the scale.[263] Another machine learning system designed to help effectively allocate medical resources was found to classify patients with asthma as being at "low risk" of dying from pneumonia. Having asthma is actually a severe risk factor, but since the patients having asthma would usually get much more medical care, they were relatively unlikely to die according to the training data. The correlation between asthma and low risk of dying from pneumonia was real, but misleading.[264] + +People who have been harmed by an algorithm's decision have a right to an explanation.[265] Doctors, for example, are expected to clearly and completely explain to their colleagues the reasoning behind any decision they make. Early drafts of the European Union's General Data Protection Regulation in 2016 included an explicit statement that this right exists.[n] Industry experts noted that this is an unsolved problem with no solution in sight. Regulators argued that nevertheless the harm is real: if the problem has no solution, the tools should not be used.[266] + +DARPA established the XAI ("Explainable Artificial Intelligence") program in 2014 to try to solve these problems.[267] + +Several approaches aim to address the transparency problem. SHAP makes it possible to visualise the contribution of each feature to the output.[268] LIME can locally approximate a model's outputs with a simpler, interpretable model.[269] Multitask learning provides a large number of outputs in addition to the target classification. 
These other outputs can help developers deduce what the network has learned.[270] Deconvolution, DeepDream and other generative methods can allow developers to see what different layers of a deep network for computer vision have learned, and produce output that can suggest what the network is learning.[271] For generative pre-trained transformers, Anthropic developed a technique based on dictionary learning that associates patterns of neuron activations with human-understandable concepts.[272] + +Bad actors and weaponized AI +Main articles: Lethal autonomous weapon, Artificial intelligence arms race, and AI safety +Artificial intelligence provides a number of tools that are useful to bad actors, such as authoritarian governments, terrorists, criminals or rogue states. + +A lethal autonomous weapon is a machine that locates, selects and engages human targets without human supervision.[o] Widely available AI tools can be used by bad actors to develop inexpensive autonomous weapons and, if produced at scale, they are potentially weapons of mass destruction.[274] Even when used in conventional warfare, they currently cannot reliably choose targets and could potentially kill an innocent person.[274] In 2014, 30 nations (including China) supported a ban on autonomous weapons under the United Nations' Convention on Certain Conventional Weapons, however the United States and others disagreed.[275] By 2015, over fifty countries were reported to be researching battlefield robots.[276] + +AI tools make it easier for authoritarian governments to efficiently control their citizens in several ways. Face and voice recognition allow widespread surveillance. Machine learning, operating this data, can classify potential enemies of the state and prevent them from hiding. Recommendation systems can precisely target propaganda and misinformation for maximum effect. Deepfakes and generative AI aid in producing misinformation. 
Advanced AI can make authoritarian centralized decision-making more competitive than liberal and decentralized systems such as markets. It lowers the cost and difficulty of digital warfare and advanced spyware.[277] All these technologies have been available since 2020 or earlier—AI facial recognition systems are already being used for mass surveillance in China.[278][279] + +There are many other ways in which AI is expected to help bad actors, some of which cannot be foreseen. For example, machine-learning AI is able to design tens of thousands of toxic molecules in a matter of hours.[280] + +Technological unemployment +Main articles: Workplace impact of artificial intelligence and Technological unemployment +Economists have frequently highlighted the risks of redundancies from AI, and speculated about unemployment if there is no adequate social policy for full employment.[281] + +In the past, technology has tended to increase rather than reduce total employment, but economists acknowledge that "we're in uncharted territory" with AI.[282] A survey of economists showed disagreement about whether the increasing use of robots and AI will cause a substantial increase in long-term unemployment, but they generally agree that it could be a net benefit if productivity gains are redistributed.[283] Risk estimates vary; for example, in the 2010s, Michael Osborne and Carl Benedikt Frey estimated 47% of U.S. jobs are at "high risk" of potential automation, while an OECD report classified only 9% of U.S. 
jobs as "high risk".[p][285] The methodology of speculating about future employment levels has been criticised as lacking evidential foundation, and for implying that technology, rather than social policy, creates unemployment, as opposed to redundancies.[281] In April 2023, it was reported that 70% of the jobs for Chinese video game illustrators had been eliminated by generative artificial intelligence.[286][287] Early-career workers showed decreasing employment rates in some AI-exposed occupations.[288] + +Unlike previous waves of automation, many middle-class jobs may be eliminated by artificial intelligence; The Economist stated in 2015 that "the worry that AI could do to white-collar jobs what steam power did to blue-collar ones during the Industrial Revolution" is "worth taking seriously".[289] Jobs at extreme risk range from paralegals to fast food cooks, while job demand is likely to increase for care-related professions ranging from personal healthcare to the clergy.[290] In July 2025, Ford CEO Jim Farley predicted that "artificial intelligence is going to replace literally half of all white-collar workers in the U.S."[291] + +From the early days of the development of artificial intelligence, there have been arguments, for example, those put forward by Joseph Weizenbaum, about whether tasks that can be done by computers actually should be done by them, given the difference between computers and humans, and between quantitative calculation and qualitative, value-based judgement.[292] + +Substitution for human–human interaction +See also: Deaths linked to chatbots +With the increase of loneliness in the early 21st century, AI is sometimes identified as a potential source of relief to this problem. 
It would be possible, via human-like qualities built into AI products,[293] for individuals to assume that this need can be met by artificial means.[294][295] In some cases, people approach artificial intelligence for companionship when they believe that they would not find acceptance due to feeling outcast.[296] Examples of harm coming to humans from advanced chatbots have been reported in courts in the United States, with AI companies accused of creating products that endanger humans through emotional confusion or deception.[297][298] + +Existential risk +Main article: Existential risk from artificial intelligence +Recent public debates in artificial intelligence have increasingly focused on its broader societal and ethical implications. It has been argued AI will become so powerful that humanity may irreversibly lose control of it. This could, as physicist Stephen Hawking stated, "spell the end of the human race".[299] This scenario has been common in science fiction, when a computer or robot suddenly develops a human-like "self-awareness" (or "sentience" or "consciousness") and becomes a malevolent character.[q] These sci-fi scenarios are misleading in several ways. + +First, AI does not require human-like sentience to be an existential risk. Modern AI programs are given specific goals and use learning and intelligence to achieve them. 
Philosopher Nick Bostrom argued that if one gives almost any goal to a sufficiently powerful AI, it may choose to destroy humanity to achieve it (he used the example of an automated paperclip factory that destroys the world to get more iron for paperclips).[301] Stuart Russell gives the example of a household robot that tries to find a way to kill its owner to prevent itself from being unplugged, reasoning that "you can't fetch the coffee if you're dead."[302] In order to be safe for humanity, a superintelligence would have to be genuinely aligned with humanity's morality and values so that it is "fundamentally on our side".[303] + +Second, Yuval Noah Harari argues that AI does not require a robot body or physical control to pose an existential risk. The essential parts of civilization are not physical. Things like ideologies, law, government, money and the economy are built on language; they exist because there are stories that billions of people believe. The current prevalence of misinformation suggests that an AI could use language to convince people to believe anything, even to take actions that are destructive.[304] Geoffrey Hinton said in 2025 that modern AI is particularly "good at persuasion" and getting better all the time. He asks "Suppose you wanted to invade the capital of the US. Do you have to go there and do it yourself? No. You just have to be good at persuasion."[305] + +The opinions amongst experts and industry insiders are mixed, with sizable fractions both concerned and unconcerned by risk from eventual superintelligent AI.[306] Personalities such as Stephen Hawking, Bill Gates, and Elon Musk,[307] as well as AI pioneers such as Geoffrey Hinton, Yoshua Bengio, Stuart Russell, Demis Hassabis, and Sam Altman, have expressed concerns about existential risk from AI. 
+ +In May 2023, Geoffrey Hinton announced his resignation from Google in order to be able to "freely speak out about the risks of AI" without "considering how this impacts Google".[308] He notably mentioned risks of an AI takeover,[309] and stressed that in order to avoid the worst outcomes, establishing safety guidelines will require cooperation among those competing in use of AI.[310] + +In 2023, many leading AI experts endorsed the joint statement that "Mitigating the risk of extinction from AI should be a global priority alongside other societal-scale risks such as pandemics and nuclear war".[311] + +Some other researchers were more optimistic. AI pioneer Jürgen Schmidhuber did not sign the joint statement, emphasising that in 95% of all cases, AI research is about making "human lives longer and healthier and easier."[312] While the tools that are now being used to improve lives can also be used by bad actors, "they can also be used against the bad actors."[313][314] Andrew Ng also argued that "it's a mistake to fall for the doomsday hype on AI—and that regulators who do will only benefit vested interests."[315] Yann LeCun, a Turing Award winner, disagreed with the idea that AI will subordinate humans "simply because they are smarter, let alone destroy [us]",[316] "scoff[ing] at his peers' dystopian scenarios of supercharged misinformation and even, eventually, human extinction." 
In contrast, he claimed that "intelligent machines will usher in a new renaissance for humanity, a new era of enlightenment."[317] In the early 2010s, experts argued that the risks are too distant in the future to warrant research or that humans will be valuable from the perspective of a superintelligent machine.[318] However, after 2016, the study of current and future risks and possible solutions became a serious area of research.[319] + +Ethical machines and alignment +Main articles: Machine ethics, AI safety, Friendly artificial intelligence, Artificial moral agents, and Human Compatible +See also: Human-AI interaction +Friendly AI are machines that have been designed from the beginning to minimize risks and to make choices that benefit humans. Eliezer Yudkowsky, who coined the term, argues that developing friendly AI should be a higher research priority: it may require a large investment and it must be completed before AI becomes an existential risk.[320] + +Machines with intelligence have the potential to use their intelligence to make ethical decisions. The field of machine ethics provides machines with ethical principles and procedures for resolving ethical dilemmas.[321] The field of machine ethics is also called computational morality,[321] and was founded at an AAAI symposium in 2005.[322] + +Other approaches include Wendell Wallach's "artificial moral agents"[323] and Stuart J. Russell's three principles for developing provably beneficial machines.[324] + +Open source +See also: Open-source artificial intelligence and Lists of open-source artificial intelligence software +Active organizations in the AI open-source community include Hugging Face,[325] Google,[326] EleutherAI and Meta.[327] Various AI models, such as Llama 2, Mistral or Stable Diffusion, have been made open-weight,[328][329] meaning that their architecture and trained parameters (the "weights") are publicly available. 
Open-weight models can be freely fine-tuned, which allows companies to specialize them with their own data and for their own use-case.[330] Open-weight models are useful for research and innovation but can also be misused. Since they can be fine-tuned, any built-in security measure, such as objecting to harmful requests, can be trained away until it becomes ineffective. Some researchers warn that future AI models may develop dangerous capabilities (such as the potential to drastically facilitate bioterrorism) and that once released on the Internet, they cannot be deleted everywhere if needed. They recommend pre-release audits and cost-benefit analyses.[331] diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py similarity index 77% rename from benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py rename to benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py index ed88719ded..11b04e9c91 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step1_add_resource.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py @@ -1,10 +1,15 @@ #!/usr/bin/env python3 -"""Step 1: Import local code directories as benchmark data via OpenViking SDK. +"""Step 1 (Effectiveness): Import real code repos into OpenViking (no indexing). Recursively scans a local directory, imports each subdirectory (at all depths) -separately via SyncOpenViking.add_resource (wait=True), and saves progress -after each directory for resumability. Directory order is deterministic -(sorted at each level). +via SyncOpenViking.add_resource (wait=True, build_index=False, summarize=False), +and saves progress after each directory for resumability. + +After all imports are done, run step2_reindex.py to build vector indexes, +then step3_quality.py to evaluate retrieval quality. 
+ +Prerequisites: + - Download code repos and place them under the source directory manually. Usage: python3 step1_add_resource.py @@ -20,31 +25,25 @@ from openviking.sync_client import SyncOpenViking DEFAULT_SOURCE = os.path.expanduser("~/.openviking/data/benchmark/OpenViking-main") -PROGRESS_FILE = os.path.expanduser("~/.openviking/data/benchmark/.code-import-progress") -BENCHMARK_PARENT = "viking://resources/benchmark" +PROGRESS_FILE = os.path.expanduser("~/.openviking/data/benchmark/.effectiveness-import-progress") +BENCHMARK_PARENT = "viking://resources/benchmark/effectiveness" def load_progress() -> set[str]: - """Load completed directory names from progress file.""" if not os.path.exists(PROGRESS_FILE): return set() with open(PROGRESS_FILE) as f: return {line.strip() for line in f if line.strip()} -def save_progress(dir_name: str) -> None: - """Append a completed directory name to progress file.""" +def save_progress(rel_dir: str) -> None: os.makedirs(os.path.dirname(PROGRESS_FILE), exist_ok=True) with open(PROGRESS_FILE, "a") as f: - f.write(dir_name + "\n") + f.write(rel_dir + "\n") def scan_subdirs_recursive(root: str) -> list[str]: - """Return sorted list of all subdirectory relative paths under root (recursive, deterministic order). - - Skips hidden directories (starting with '.'). Order is deterministic: - sorted at each level, parent before children. 
- """ + """Return sorted list of all subdirectory relative paths (deterministic order).""" result: list[str] = [] def _walk(dir_path: str, rel_prefix: str) -> None: @@ -68,7 +67,7 @@ def _walk(dir_path: str, rel_prefix: str) -> None: def main(): parser = argparse.ArgumentParser( - description="Step 1: Import local code directories as benchmark data via SDK" + description="Step 1 (Effectiveness): Import real code repos (no indexing)" ) parser.add_argument( "--source", @@ -88,14 +87,14 @@ def main(): return print("=" * 80) - print("Step 1: Import Local Code Directories as Benchmark Data") + print("Step 1 (Effectiveness): Import Code Repos (no VLM/embedding)") print("=" * 80) print(f" Source: {source}") print(f" Parent: {args.parent}") print(f" Progress: {PROGRESS_FILE}") + print(" Indexing: DISABLED (build_index=False, summarize=False)") print() - # Scan subdirectories recursively subdirs = scan_subdirs_recursive(source) total = len(subdirs) print(f" Total directories to import: {total}") @@ -105,7 +104,6 @@ def main(): print("No subdirectories found. 
Nothing to import.") return - # Load progress completed = load_progress() if completed: already_done = [d for d in subdirs if d in completed] @@ -122,7 +120,6 @@ def main(): continue dir_path = os.path.join(source, rel_dir) - # Build parent URI: viking://resources/benchmark/ parent_rel = os.path.dirname(rel_dir) parent_uri = f"{args.parent}/{parent_rel}" if parent_rel else args.parent print(f" [{i}/{total}] Importing: {rel_dir} ...", end="", flush=True) @@ -132,9 +129,11 @@ def main(): result = client.add_resource( path=dir_path, parent=parent_uri, - reason=f"benchmark data: {rel_dir}", + reason=f"benchmark effectiveness: {rel_dir}", wait=True, create_parent=True, + build_index=False, + summarize=False, ) elapsed = time.monotonic() - t0 root_uri = result.get("root_uri", "?") @@ -155,7 +154,6 @@ def main(): client.close() - # Summary print() print("Summary:") for r in results: @@ -174,11 +172,12 @@ def main(): print() if total_done >= total and failed_count == 0: - print(f"All {total} directories imported and processed successfully.") - print("Next step: run step2_benchmark.py to measure grep performance") + print(f"All {total} directories imported successfully (no indexing).") + print("Next step: run step2_reindex.py to build vector indexes") else: print( - f" Imported: {ok_count} Failed: {failed_count} Skipped: {skipped_count} Remaining: {total - total_done}" + f" Imported: {ok_count} Failed: {failed_count} " + f"Skipped: {skipped_count} Remaining: {total - total_done}" ) if failed_count > 0: print("Re-run this script to resume from where it left off.") diff --git a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_reindex.py b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_reindex.py new file mode 100644 index 0000000000..775c43da7c --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_reindex.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +"""Step 2 (Effectiveness): Build vector indexes for imported code repos. 
+ +Submits async reindex tasks for each first-level subdirectory via +SyncHTTPClient.reindex(wait=False), with a concurrency limit of 2 +running tasks. When a task completes, the next one is submitted. +This avoids tree-lock conflicts and prevents resource exhaustion. + +Prerequisites: + 1. Run step1_add_resource.py to import repos (without indexing) + 2. Start openviking-server manually + +Usage: + python3 step2_reindex.py +""" + +from __future__ import annotations + +import argparse +import os +import time + +from openviking_cli.client.sync_http import SyncHTTPClient + +DEFAULT_SOURCE = os.path.expanduser("~/.openviking/data/benchmark/OpenViking-main") +PROGRESS_FILE = os.path.expanduser("~/.openviking/data/benchmark/.effectiveness-reindex-progress") +BENCHMARK_PARENT = "viking://resources/benchmark/effectiveness" + +POLL_INTERVAL = 5 # seconds between task status checks +MAX_CONCURRENT = 16 # max running tasks at a time + + +def load_progress() -> set[str]: + if not os.path.exists(PROGRESS_FILE): + return set() + with open(PROGRESS_FILE) as f: + return {line.strip() for line in f if line.strip()} + + +def save_progress(rel_dir: str) -> None: + os.makedirs(os.path.dirname(PROGRESS_FILE), exist_ok=True) + with open(PROGRESS_FILE, "a") as f: + f.write(rel_dir + "\n") + + +def scan_first_level_dirs(root: str) -> list[str]: + """Return sorted list of first-level subdirectory names.""" + try: + entries = sorted(os.listdir(root)) + except OSError: + return [] + return [e for e in entries if not e.startswith(".") and os.path.isdir(os.path.join(root, e))] + + +def main(): + parser = argparse.ArgumentParser( + description="Step 2 (Effectiveness): Build vector indexes via openviking-server" + ) + parser.add_argument( + "--source", + default=DEFAULT_SOURCE, + help=f"Local source directory (must match step1, default: {DEFAULT_SOURCE})", + ) + parser.add_argument( + "--parent", + default=BENCHMARK_PARENT, + help=f"Parent Viking URI (default: {BENCHMARK_PARENT})", + ) + 
parser.add_argument( + "--concurrency", + type=int, + default=MAX_CONCURRENT, + help=f"Max concurrent reindex tasks (default: {MAX_CONCURRENT})", + ) + args = parser.parse_args() + + source = os.path.expanduser(args.source) + max_concurrent = max(1, args.concurrency) + + print("=" * 80) + print("Step 2 (Effectiveness): Build Vector Indexes (via openviking-server)") + print("=" * 80) + print(f" Source: {source}") + print(f" Parent: {args.parent}") + print(f" Progress: {PROGRESS_FILE}") + print(" Mode: vectors_only (wait=False, async)") + print(f" Concurrency: {max_concurrent}") + print() + print(" Prerequisite: openviking-server must be running!") + print() + + # Scan first-level dirs only + first_level = scan_first_level_dirs(source) + total = len(first_level) + print(f" First-level directories to reindex: {total}") + print() + + if total == 0: + print("No subdirectories found. Run step1_add_resource.py first.") + return + + completed = load_progress() + if completed: + already_done = [d for d in first_level if d in completed] + print(f" Resuming: {len(already_done)} directories already reindexed") + print() + + client = SyncHTTPClient(account="default", user="default") + client.initialize() + + # Build work queue (skip already completed) + work_queue: list[str] = [name for name in first_level if name not in completed] + skipped_count = len(first_level) - len(work_queue) + + # running: task_id -> (name, submit_time) + running: dict[str, tuple[str, float]] = {} + results: list[dict] = [] + + def _submit_next() -> bool: + """Submit the next item from work_queue if slot available. 
Returns True if submitted.""" + if not work_queue or len(running) >= max_concurrent: + return False + name = work_queue.pop(0) + dir_uri = f"{args.parent}/{name}" + idx = total - len(work_queue) + print(f" [{idx}/{total}] Submitting: {name} ...", end="", flush=True) + try: + result = client.reindex(uri=dir_uri, mode="vectors_only", wait=False) + task_id = result.get("task_id", "") + if task_id: + print(f" task_id={task_id[:8]}...") + running[task_id] = (name, time.monotonic()) + else: + print(" completed synchronously") + save_progress(name) + results.append({"dir": name, "status": "ok", "elapsed_s": 0.0}) + return True + except Exception as e: + print(f" FAILED: {e}") + results.append({"dir": name, "status": "failed", "error": str(e)[:500]}) + return True + + # Fill initial slots + while len(running) < max_concurrent and work_queue: + _submit_next() + + if not running and not results: + client.close() + _print_summary(results, skipped_count, first_level) + return + + # Poll loop: check running tasks, submit new ones as slots free up + print() + print(f" Running {len(running)} tasks, {len(work_queue)} queued") + print() + + while running: + done_ids = [] + for task_id, (name, submit_time) in list(running.items()): + try: + task_info = client.get_task(task_id) + except Exception: + continue + if task_info is None: + continue + status = task_info.get("status", "") + if status in ("completed", "failed"): + elapsed = time.monotonic() - submit_time + if status == "completed": + print(f" DONE {name} ({elapsed:.1f}s)") + save_progress(name) + results.append({"dir": name, "status": "ok", "elapsed_s": round(elapsed, 1)}) + else: + error = task_info.get("error", "unknown error") + print(f" FAIL {name} ({elapsed:.1f}s): {error}") + results.append( + { + "dir": name, + "status": "failed", + "elapsed_s": round(elapsed, 1), + "error": error, + } + ) + done_ids.append(task_id) + + for tid in done_ids: + del running[tid] + + # Fill freed slots + while len(running) < max_concurrent 
and work_queue: + _submit_next() + + if running: + time.sleep(POLL_INTERVAL) + + client.close() + _print_summary(results, skipped_count, first_level) + + +def _print_summary(results: list[dict], skipped_count: int, all_dirs: list[str]) -> None: + print() + print("Summary:") + ok_count = sum(1 for r in results if r.get("status") == "ok") + failed_count = sum(1 for r in results if r.get("status") == "failed") + total_done = skipped_count + ok_count + + for r in results: + status = r.get("status", "unknown") + line = f" {status.upper():>7s} {r.get('dir', '?')}" + if "elapsed_s" in r: + line += f" ({r['elapsed_s']}s)" + if status == "failed": + line += f" -- {r.get('error', '')}" + print(line) + + print() + total = len(all_dirs) + if total_done >= total and failed_count == 0: + print(f"All {total} directories reindexed successfully.") + print("Next step: run step3_quality.py to evaluate retrieval quality") + else: + print( + f" Reindexed: {ok_count} Failed: {failed_count} " + f"Skipped: {skipped_count} Remaining: {total - total_done}" + ) + if failed_count > 0: + print("Re-run this script to resume from where it left off.") + + +if __name__ == "__main__": + main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step3_retrieval_quality.py b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py similarity index 69% rename from benchmark/retrieval/grep/vikingdb_bm25/step3_retrieval_quality.py rename to benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py index a4aae34a8f..80500415fe 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/step3_retrieval_quality.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py @@ -1,23 +1,19 @@ #!/usr/bin/env python3 -"""Step 3: Retrieval quality evaluation — compare auto (bm25) vs fs grep. +"""Step 3 (Effectiveness): Evaluate retrieval quality for real code repos. -Prerequisites: - 1. Run step1_add_resource.py to import repos (includes VLM+embedding) - 2. 
Ensure ov.conf has: - "grep": {"engine": "auto", "switch_to_remote_threshold": 0} - (switch_to_remote_threshold = 0 forces VikingDB BM25 for all queries) - 3. Restart the server after changing ov.conf +Compares SDK grep results against local regex ground truth. +Computes Recall, Precision, F1 per query pattern. -Approach: - - Ground truth: scan local repo files with Python regex (equivalent to fs engine) - - Test: call SDK grep to get structured results - - Compare: compute Recall, Precision, F1 per query pattern +Prerequisites: + 1. Run step1_add_resource.py to import repos (no indexing) + 2. Run step2_reindex.py to build vector indexes + 3. Ensure ov.conf has the desired grep engine config KEYWORDS: Fill the KEYWORDS list below with real terms from the imported -repos. Each keyword will be tested for retrieval quality. +repos before running. Usage: - python3 step3_retrieval_quality.py + python3 step3_quality.py """ from __future__ import annotations @@ -28,23 +24,19 @@ from openviking.sync_client import SyncOpenViking -BASE_URI = "viking://resources/benchmark" -DATA_DIR = os.path.expanduser("~/.openviking/data") +BASE_URI = "viking://resources/benchmark/effectiveness" +DATA_DIR = os.path.expanduser("~/.openviking/data/benchmark") KEYWORDS: list[str] = [] def build_test_patterns() -> list[tuple[str, str]]: patterns = [] - for kw in KEYWORDS: patterns.append((f"keyword: {kw}", kw)) - if len(KEYWORDS) >= 2: patterns.append((f"multi 2: {KEYWORDS[0]}|{KEYWORDS[1]}", f"{KEYWORDS[0]}|{KEYWORDS[1]}")) - - patterns.append(("no-match: zzz_nonexistent_benchmark", "zzz_nonexistent_benchmark")) - + patterns.append(("no-match: zzz_nonexistent_quality", "zzz_nonexistent_quality")) return patterns @@ -52,7 +44,6 @@ def run_sdk_grep(client: SyncOpenViking, uri: str, pattern: str) -> tuple[set[st t0 = time.monotonic() result = client.grep(uri=uri, pattern=pattern, node_limit=100000) elapsed = time.monotonic() - t0 - uris = set() if isinstance(result, dict): for match in 
result.get("matches", []): @@ -119,7 +110,6 @@ def compute_metrics(truth: set[str], predicted: set[str]) -> dict: return {"recall": 1.0, "precision": 1.0, "f1": 1.0, "tp": 0, "fp": 0, "fn": 0} if not truth: return {"recall": 0.0, "precision": 0.0, "f1": 0.0, "tp": 0, "fp": len(predicted), "fn": 0} - tp = len(truth & predicted) fp = len(predicted - truth) fn = len(truth - predicted) @@ -140,39 +130,31 @@ def main(): if not KEYWORDS: print("WARNING: KEYWORDS list is empty. Fill it with real terms before running.") - print(" Edit step3_retrieval_quality.py and add keywords to the KEYWORDS list.\n") + print(" Edit step2_quality.py and add keywords to the KEYWORDS list.\n") test_patterns = build_test_patterns() print("=" * 110) - print("Retrieval Quality Evaluation: auto (bm25+fs) vs local fs (ground truth)") + print("Effectiveness Evaluation: SDK grep vs local regex (ground truth)") print("=" * 110) print(f"URI: {uri}") print(f"Data dir: {DATA_DIR}/benchmark/") - print(f"Local dirs: {search_dirs}") print(f"Patterns: {len(test_patterns)}") print() - print("Ensure ov.conf has:") - print(' "grep": {"engine": "auto", "switch_to_remote_threshold": 0}') - print("And the server has been restarted.") - print() results = [] - client = SyncOpenViking() client.initialize() try: for label, pattern in test_patterns: print(f"--- {label} (pattern: {pattern}) ---") - truth_uris, fs_elapsed = compute_ground_truth(pattern, search_dirs) print(f" Ground truth (local fs): {len(truth_uris)} matches ({fs_elapsed:.2f}s)") - try: auto_uris, auto_elapsed = run_sdk_grep(client, uri, pattern) except Exception as e: - print(f" Auto grep FAILED: {e}") + print(f" SDK grep FAILED: {e}") results.append( { "label": label, @@ -182,7 +164,7 @@ def main(): } ) continue - print(f" Auto grep (bm25+fs): {len(auto_uris)} matches ({auto_elapsed:.2f}s)") + print(f" SDK grep: {len(auto_uris)} matches ({auto_elapsed:.2f}s)") metrics = compute_metrics(truth_uris, auto_uris) print( @@ -195,20 +177,14 @@ def main(): 
if metrics["fp"] > 0: print(f" Extra (FP): {metrics['fp']}") - if metrics["fn"] > 0: - missed = sorted(truth_uris - auto_uris)[:5] - print(" Sample missed URIs:") - for u in missed: - print(f" {u}") - results.append( { "label": label, "pattern": pattern, "truth_count": len(truth_uris), - "auto_count": len(auto_uris), + "found_count": len(auto_uris), "fs_elapsed_s": round(fs_elapsed, 3), - "auto_elapsed_s": round(auto_elapsed, 3), + "sdk_elapsed_s": round(auto_elapsed, 3), **metrics, } ) @@ -218,7 +194,7 @@ def main(): print() print("=" * 120) print( - f"{'Label':<45} {'Truth':>6} {'Auto':>6} {'Recall':>8} {'Prec':>8} {'F1':>8} {'Missed':>8}" + f"{'Label':<45} {'Truth':>6} {'Found':>6} {'Recall':>8} {'Prec':>8} {'F1':>8} {'Missed':>8}" ) print("-" * 120) for r in results: @@ -229,30 +205,11 @@ def main(): ) else: print( - f"{r['label']:<45} {r['truth_count']:>6} {r['auto_count']:>6} " + f"{r['label']:<45} {r['truth_count']:>6} {r['found_count']:>6} " f"{r['recall']:>8.4f} {r['precision']:>8.4f} {r['f1']:>8.4f} {r['fn']:>8}" ) print() - has_recall_loss = any(r.get("fn", 0) > 0 for r in results) - has_precision_loss = any(r.get("fp", 0) > 0 for r in results) - if not has_recall_loss and not has_precision_loss: - print( - "VERDICT: All queries achieved perfect recall and precision. bm25 recall is complete." - ) - else: - if has_recall_loss: - print("VERDICT: Recall loss detected — some files not recalled by bm25.") - print( - " Possible causes: content field truncation, tokenizer mismatch, or incomplete reindex." - ) - if has_precision_loss: - print("VERDICT: Precision loss detected — unexpected matches in auto results.") - print( - " This should not happen (phase 2 regex guarantees precision). Investigate URI format." 
- ) - print() - if __name__ == "__main__": main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step0_prepare_data.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step0_prepare_data.py new file mode 100644 index 0000000000..b8118278ae --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step0_prepare_data.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +"""Step 0 (Performance): Prepare synthetic benchmark data for grep testing. + +Reads a source text file (ai_wiki.txt), replicates it across configurable +directories and files, and injects target words at specified probabilities +for retrieval testing. + +Directory layout: + /dir_000/wiki_000.txt ... dir_000/wiki_999.txt + /dir_001/wiki_000.txt ... dir_001/wiki_999.txt + ... + +Each dir_xxx contains 1000 files. Default: 100 directories (100,000 files). + +Target words are injected by replacing a random word in the text. +All target words must NOT exist in the original source text. + +Default target words and probabilities (12 words, 4 tiers): + 50% : quantumnexus, synapseflow, deepvector + 10% : bm25engine, vikingcore, retrievex + 0.1% : zephyrhash, cryptolattice, nebulalink + 0.01% : xenoform, quarkpulse, omegabind + +Usage: + python3 step0_prepare_data.py + python3 step0_prepare_data.py --num-dirs 50 --seed 42 +""" + +from __future__ import annotations + +import argparse +import os +import random +import re +import time + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_SOURCE = os.path.join(SCRIPT_DIR, "..", "ai_wiki.txt") +DEFAULT_OUTPUT = os.path.expanduser("~/.openviking/data/benchmark/synthetic") +FILES_PER_DIR = 1000 + +TARGET_WORDS = { + 0.50: ["quantumnexus", "synapseflow", "deepvector"], + 0.10: ["bm25engine", "vikingcore", "retrievex"], + 0.001: ["zephyrhash", "cryptolattice", "nebulalink"], + 0.0001: ["xenoform", "quarkpulse", "omegabind"], +} + + +def verify_target_words(text: str) -> None: + """Verify that no target word appears in the source 
text.""" + text_lower = text.lower() + conflicts = [] + for prob, words in TARGET_WORDS.items(): + for word in words: + if word.lower() in text_lower: + conflicts.append((word, prob)) + if conflicts: + for word, prob in conflicts: + print(f" CONFLICT: target word '{word}' (p={prob}) found in source text!") + raise ValueError( + f"{len(conflicts)} target word(s) already exist in source text. " + "Choose different target words." + ) + + +def inject_word(text: str, target: str) -> str: + """Replace a random word in the text with the target word.""" + words = list(re.finditer(r"\b\w+\b", text)) + if not words: + return text + match = random.choice(words) + return text[: match.start()] + target + text[match.end() :] + + +def generate_dataset( + source_text: str, + output_dir: str, + num_dirs: int, + seed: int = 42, +) -> dict: + rng = random.Random(seed) + total_files = num_dirs * FILES_PER_DIR + injection_stats = {word: 0 for prob_words in TARGET_WORDS.values() for word in prob_words} + + print(f" Generating {num_dirs} dirs x {FILES_PER_DIR} files = {total_files} files") + print(f" Output: {output_dir}") + print() + + t0 = time.monotonic() + + for dir_idx in range(num_dirs): + dir_name = f"dir_{dir_idx:03d}" + dir_path = os.path.join(output_dir, dir_name) + os.makedirs(dir_path, exist_ok=True) + + for file_idx in range(FILES_PER_DIR): + file_name = f"wiki_{file_idx:03d}.txt" + file_path = os.path.join(dir_path, file_name) + content = source_text + + for prob, words in TARGET_WORDS.items(): + for word in words: + if rng.random() < prob: + content = inject_word(content, word) + injection_stats[word] += 1 + + with open(file_path, "w") as f: + f.write(content) + + if (dir_idx + 1) % 10 == 0 or dir_idx == num_dirs - 1: + elapsed = time.monotonic() - t0 + print(f" [{dir_idx + 1}/{num_dirs}] dirs created ({elapsed:.1f}s)") + + elapsed = time.monotonic() - t0 + return { + "total_files": total_files, + "num_dirs": num_dirs, + "files_per_dir": FILES_PER_DIR, + "elapsed_s": 
round(elapsed, 1), + "injection_stats": injection_stats, + } + + +def main(): + parser = argparse.ArgumentParser( + description="Step 0 (Performance): Prepare synthetic benchmark data" + ) + parser.add_argument( + "--source", default=DEFAULT_SOURCE, help=f"Source text file (default: {DEFAULT_SOURCE})" + ) + parser.add_argument( + "--output", default=DEFAULT_OUTPUT, help=f"Output directory (default: {DEFAULT_OUTPUT})" + ) + parser.add_argument( + "--num-dirs", type=int, default=100, help="Number of directories (default: 100)" + ) + parser.add_argument("--seed", type=int, default=42, help="Random seed (default: 42)") + args = parser.parse_args() + + source = os.path.normpath(os.path.expanduser(args.source)) + output = os.path.expanduser(args.output) + + print("=" * 80) + print("Step 0 (Performance): Prepare Synthetic Benchmark Data") + print("=" * 80) + print(f" Source: {source}") + print(f" Output: {output}") + print(f" Dirs: {args.num_dirs}") + print(f" Seed: {args.seed}") + print() + + if not os.path.isfile(source): + print(f"ERROR: Source file not found: {source}") + return + + with open(source) as f: + source_text = f.read() + + print(f" Source text size: {len(source_text):,} chars") + print() + print(" Verifying target words against source text...") + try: + verify_target_words(source_text) + except ValueError as e: + print(f"ERROR: {e}") + return + print(" All target words verified OK.") + print() + print(" Target words and injection probabilities:") + for prob in sorted(TARGET_WORDS.keys(), reverse=True): + words = TARGET_WORDS[prob] + pct = f"{prob * 100:.2f}%" + print(f" {pct:>8s} : {', '.join(words)}") + print() + + summary = generate_dataset(source_text, output, args.num_dirs, seed=args.seed) + + print() + print("=" * 80) + print("Summary:") + print(f" Total files: {summary['total_files']:,}") + print(f" Directories: {summary['num_dirs']}") + print(f" Elapsed: {summary['elapsed_s']}s") + print() + print(" Target word injection counts:") + total_files = 
summary["total_files"] + for prob in sorted(TARGET_WORDS.keys(), reverse=True): + words = TARGET_WORDS[prob] + for word in words: + actual = summary["injection_stats"][word] + expected = total_files * prob + pct = actual / total_files * 100 if total_files > 0 else 0 + print( + f" {word:<18s} actual={actual:>6d} " + f"expected~{expected:>8.1f} " + f"rate={pct:.3f}% (target={prob * 100:.2f}%)" + ) + + print() + print(f"Data ready at: {output}") + print("Next: run step1_add_resource.py to import into OpenViking") + + +if __name__ == "__main__": + main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step1_add_resource.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step1_add_resource.py new file mode 100644 index 0000000000..3d5a56a08b --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step1_add_resource.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +"""Step 1 (Performance): Import synthetic data into OpenViking WITHOUT indexing. + +Imports each directory recursively via SyncOpenViking.add_resource with +build_index=False and summarize=False, to skip slow VLM/embedding steps. +Progress is saved after each directory for resumability. + +After all imports are done, run step2_reindex.py to build vector indexes, +then step3_benchmark.py to measure performance. 
+ +Usage: + python3 step1_add_resource.py + python3 step1_add_resource.py --source ~/.openviking/data/benchmark/synthetic +""" + +from __future__ import annotations + +import argparse +import os +import time + +from openviking.sync_client import SyncOpenViking + +DEFAULT_SOURCE = os.path.expanduser("~/.openviking/data/benchmark/synthetic") +PROGRESS_FILE = os.path.expanduser("~/.openviking/data/benchmark/.perf-import-progress") +BENCHMARK_PARENT = "viking://resources/benchmark/performance" + + +def load_progress() -> set[str]: + if not os.path.exists(PROGRESS_FILE): + return set() + with open(PROGRESS_FILE) as f: + return {line.strip() for line in f if line.strip()} + + +def save_progress(rel_dir: str) -> None: + os.makedirs(os.path.dirname(PROGRESS_FILE), exist_ok=True) + with open(PROGRESS_FILE, "a") as f: + f.write(rel_dir + "\n") + + +def scan_subdirs_recursive(root: str) -> list[str]: + """Return sorted list of all subdirectory relative paths (deterministic order).""" + result: list[str] = [] + + def _walk(dir_path: str, rel_prefix: str) -> None: + try: + entries = sorted(os.listdir(dir_path)) + except OSError: + return + for name in entries: + if name.startswith("."): + continue + full = os.path.join(dir_path, name) + if not os.path.isdir(full): + continue + rel = f"{rel_prefix}/{name}" if rel_prefix else name + result.append(rel) + _walk(full, rel) + + _walk(root, "") + return result + + +def main(): + parser = argparse.ArgumentParser( + description="Step 1 (Performance): Import synthetic data (no indexing)" + ) + parser.add_argument( + "--source", + default=DEFAULT_SOURCE, + help=f"Local directory to import (default: {DEFAULT_SOURCE})", + ) + parser.add_argument( + "--parent", + default=BENCHMARK_PARENT, + help=f"Parent Viking URI (default: {BENCHMARK_PARENT})", + ) + args = parser.parse_args() + + source = os.path.expanduser(args.source) + if not os.path.isdir(source): + print(f"ERROR: Source directory does not exist: {source}") + return + + print("=" * 
80) + print("Step 1 (Performance): Import Synthetic Data (no VLM/embedding)") + print("=" * 80) + print(f" Source: {source}") + print(f" Parent: {args.parent}") + print(f" Progress: {PROGRESS_FILE}") + print(" Indexing: DISABLED (build_index=False, summarize=False)") + print() + + subdirs = scan_subdirs_recursive(source) + total = len(subdirs) + print(f" Total directories to import: {total}") + print() + + if total == 0: + print("No subdirectories found. Nothing to import.") + return + + completed = load_progress() + if completed: + already_done = [d for d in subdirs if d in completed] + print(f" Resuming: {len(already_done)} directories already imported") + print() + + client = SyncOpenViking() + client.initialize() + + results = [] + for i, rel_dir in enumerate(subdirs, 1): + if rel_dir in completed: + print(f" [{i}/{total}] SKIP (already done): {rel_dir}") + continue + + dir_path = os.path.join(source, rel_dir) + parent_rel = os.path.dirname(rel_dir) + parent_uri = f"{args.parent}/{parent_rel}" if parent_rel else args.parent + print(f" [{i}/{total}] Importing: {rel_dir} ...", end="", flush=True) + + t0 = time.monotonic() + try: + result = client.add_resource( + path=dir_path, + parent=parent_uri, + reason=f"benchmark perf: {rel_dir}", + wait=True, + create_parent=True, + build_index=False, + summarize=False, + ) + elapsed = time.monotonic() - t0 + root_uri = result.get("root_uri", "?") + print(f" OK ({elapsed:.1f}s) -> {root_uri}") + save_progress(rel_dir) + results.append({"dir": rel_dir, "status": "ok", "elapsed_s": round(elapsed, 1)}) + except Exception as e: + elapsed = time.monotonic() - t0 + print(f" FAILED ({elapsed:.1f}s): {e}") + results.append( + { + "dir": rel_dir, + "status": "failed", + "elapsed_s": round(elapsed, 1), + "error": str(e)[:500], + } + ) + + client.close() + + print() + print("Summary:") + ok_count = sum(1 for r in results if r["status"] == "ok") + failed_count = sum(1 for r in results if r["status"] == "failed") + skipped_count = sum(1 
for d in subdirs if d in completed) + total_done = skipped_count + ok_count + + for r in results: + status = r["status"] + line = f" {status.upper():>7s} {r['dir']} ({r['elapsed_s']}s)" + if status == "failed": + line += f" -- {r.get('error', '')}" + print(line) + + print() + if total_done >= total and failed_count == 0: + print(f"All {total} directories imported successfully (no indexing).") + print("Next step: run step2_reindex.py to build vector indexes") + else: + print( + f" Imported: {ok_count} Failed: {failed_count} " + f"Skipped: {skipped_count} Remaining: {total - total_done}" + ) + if failed_count > 0: + print("Re-run this script to resume from where it left off.") + + +if __name__ == "__main__": + main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step2_reindex.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step2_reindex.py new file mode 100644 index 0000000000..1caff8161f --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step2_reindex.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +"""Step 2 (Performance): Build vector indexes for imported data. + +Submits async reindex tasks for each first-level subdirectory via +SyncHTTPClient.reindex(wait=False), with a concurrency limit of 2 +running tasks. When a task completes, the next one is submitted. +This avoids tree-lock conflicts and prevents resource exhaustion. + +Prerequisites: + 1. Run step1_add_resource.py to import data (without indexing) + 2. 
Start openviking-server manually + +Usage: + python3 step2_reindex.py +""" + +from __future__ import annotations + +import argparse +import os +import time + +from openviking_cli.client.sync_http import SyncHTTPClient + +DEFAULT_SOURCE = os.path.expanduser("~/.openviking/data/benchmark/synthetic") +PROGRESS_FILE = os.path.expanduser("~/.openviking/data/benchmark/.perf-reindex-progress") +BENCHMARK_PARENT = "viking://resources/benchmark/performance" + +POLL_INTERVAL = 5 # seconds between task status checks +MAX_CONCURRENT = 16 # max running tasks at a time + + +def load_progress() -> set[str]: + if not os.path.exists(PROGRESS_FILE): + return set() + with open(PROGRESS_FILE) as f: + return {line.strip() for line in f if line.strip()} + + +def save_progress(rel_dir: str) -> None: + os.makedirs(os.path.dirname(PROGRESS_FILE), exist_ok=True) + with open(PROGRESS_FILE, "a") as f: + f.write(rel_dir + "\n") + + +def scan_first_level_dirs(root: str) -> list[str]: + """Return sorted list of first-level subdirectory names.""" + try: + entries = sorted(os.listdir(root)) + except OSError: + return [] + return [e for e in entries if not e.startswith(".") and os.path.isdir(os.path.join(root, e))] + + +def main(): + parser = argparse.ArgumentParser( + description="Step 2 (Performance): Build vector indexes via openviking-server" + ) + parser.add_argument( + "--source", + default=DEFAULT_SOURCE, + help=f"Local source directory (must match step1, default: {DEFAULT_SOURCE})", + ) + parser.add_argument( + "--parent", + default=BENCHMARK_PARENT, + help=f"Parent Viking URI (default: {BENCHMARK_PARENT})", + ) + parser.add_argument( + "--concurrency", + type=int, + default=MAX_CONCURRENT, + help=f"Max concurrent reindex tasks (default: {MAX_CONCURRENT})", + ) + args = parser.parse_args() + + source = os.path.expanduser(args.source) + max_concurrent = max(1, args.concurrency) + + print("=" * 80) + print("Step 2 (Performance): Build Vector Indexes (via openviking-server)") + print("=" * 80) 
+ print(f" Source: {source}") + print(f" Parent: {args.parent}") + print(f" Progress: {PROGRESS_FILE}") + print(" Mode: vectors_only (wait=False, async)") + print(f" Concurrency: {max_concurrent}") + print() + print(" Prerequisite: openviking-server must be running!") + print() + + # Scan first-level dirs only + first_level = scan_first_level_dirs(source) + total = len(first_level) + print(f" First-level directories to reindex: {total}") + print() + + if total == 0: + print("No subdirectories found. Run step1_add_resource.py first.") + return + + completed = load_progress() + if completed: + already_done = [d for d in first_level if d in completed] + print(f" Resuming: {len(already_done)} directories already reindexed") + print() + + client = SyncHTTPClient(account="default", user="default") + client.initialize() + + # Build work queue (skip already completed) + work_queue: list[str] = [name for name in first_level if name not in completed] + skipped_count = len(first_level) - len(work_queue) + + # running: task_id -> (name, submit_time) + running: dict[str, tuple[str, float]] = {} + results: list[dict] = [] + + def _submit_next() -> bool: + """Submit the next item from work_queue if slot available. 
Returns True if submitted.""" + if not work_queue or len(running) >= max_concurrent: + return False + name = work_queue.pop(0) + dir_uri = f"{args.parent}/{name}" + idx = total - len(work_queue) + print(f" [{idx}/{total}] Submitting: {name} ...", end="", flush=True) + try: + result = client.reindex(uri=dir_uri, mode="vectors_only", wait=False) + task_id = result.get("task_id", "") + if task_id: + print(f" task_id={task_id[:8]}...") + running[task_id] = (name, time.monotonic()) + else: + print(" completed synchronously") + save_progress(name) + results.append({"dir": name, "status": "ok", "elapsed_s": 0.0}) + return True + except Exception as e: + print(f" FAILED: {e}") + results.append({"dir": name, "status": "failed", "error": str(e)[:500]}) + return True + + # Fill initial slots + while len(running) < max_concurrent and work_queue: + _submit_next() + + if not running and not results: + client.close() + _print_summary(results, skipped_count, first_level) + return + + # Poll loop: check running tasks, submit new ones as slots free up + print() + print(f" Running {len(running)} tasks, {len(work_queue)} queued") + print() + + while running: + done_ids = [] + for task_id, (name, submit_time) in list(running.items()): + try: + task_info = client.get_task(task_id) + except Exception: + continue + if task_info is None: + continue + status = task_info.get("status", "") + if status in ("completed", "failed"): + elapsed = time.monotonic() - submit_time + if status == "completed": + print(f" DONE {name} ({elapsed:.1f}s)") + save_progress(name) + results.append({"dir": name, "status": "ok", "elapsed_s": round(elapsed, 1)}) + else: + error = task_info.get("error", "unknown error") + print(f" FAIL {name} ({elapsed:.1f}s): {error}") + results.append( + { + "dir": name, + "status": "failed", + "elapsed_s": round(elapsed, 1), + "error": error, + } + ) + done_ids.append(task_id) + + for tid in done_ids: + del running[tid] + + # Fill freed slots + while len(running) < max_concurrent 
and work_queue: + _submit_next() + + if running: + time.sleep(POLL_INTERVAL) + + client.close() + _print_summary(results, skipped_count, first_level) + + +def _print_summary(results: list[dict], skipped_count: int, all_dirs: list[str]) -> None: + print() + print("Summary:") + ok_count = sum(1 for r in results if r.get("status") == "ok") + failed_count = sum(1 for r in results if r.get("status") == "failed") + total_done = skipped_count + ok_count + + for r in results: + status = r.get("status", "unknown") + line = f" {status.upper():>7s} {r.get('dir', '?')}" + if "elapsed_s" in r: + line += f" ({r['elapsed_s']}s)" + if status == "failed": + line += f" -- {r.get('error', '')}" + print(line) + + print() + total = len(all_dirs) + if total_done >= total and failed_count == 0: + print(f"All {total} directories reindexed successfully.") + print("Next step: run step3_benchmark.py to measure performance") + else: + print( + f" Reindexed: {ok_count} Failed: {failed_count} " + f"Skipped: {skipped_count} Remaining: {total - total_done}" + ) + if failed_count > 0: + print("Re-run this script to resume from where it left off.") + + +if __name__ == "__main__": + main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py new file mode 100644 index 0000000000..b8725c5b6e --- /dev/null +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +"""Step 3 (Performance): Benchmark grep performance and recall. + +Runs grep queries against the synthetic dataset, measuring both latency +and recall. Target words from step0 are used as test queries, with +expected hit counts computed from the injection probabilities. + +Run twice with different ov.conf engine settings to compare: + 1. Set ov.conf: "grep": {"engine": "fs"}, restart, then: + python3 step3_benchmark.py --engine-label fs + 2. 
Set ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0}, restart, then: + python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json + +Results are saved to step3_result_{engine_label}.json. +""" + +from __future__ import annotations + +import argparse +import json +import os +import time + +from openviking.sync_client import SyncOpenViking + +BASE_URI = "viking://resources/benchmark/performance" +DATA_DIR = os.path.expanduser("~/.openviking/data/benchmark/synthetic") + +# Same target words as step0_prepare_data.py +TARGET_WORDS = { + 0.50: ["quantumnexus", "synapseflow", "deepvector"], + 0.10: ["bm25engine", "vikingcore", "retrievex"], + 0.001: ["zephyrhash", "cryptolattice", "nebulalink"], + 0.0001: ["xenoform", "quarkpulse", "omegabind"], +} + +RUNS = 3 +WARMUP = 1 + + +def count_local_files() -> int: + """Count total .txt files in the synthetic dataset.""" + count = 0 + if not os.path.isdir(DATA_DIR): + return 0 + for root, _dirs, files in os.walk(DATA_DIR): + for f in files: + if f.endswith(".txt"): + count += 1 + return count + + +def run_grep(client: SyncOpenViking, pattern: str, uri: str) -> tuple[float, int]: + start = time.monotonic() + result = client.grep(uri=uri, pattern=pattern, node_limit=100000) + elapsed = time.monotonic() - start + match_count = 0 + if isinstance(result, dict): + matches = result.get("matches", []) + match_count = len(matches) + return elapsed, match_count + + +def benchmark_engine(client: SyncOpenViking, total_files: int) -> list[dict]: + results = [] + + for prob in sorted(TARGET_WORDS.keys(), reverse=True): + words = TARGET_WORDS[prob] + for word in words: + expected = int(total_files * prob) + label = f"{word} (p={prob * 100:.2f}%, expect~{expected})" + + print(f" {label} ...", end=" ", flush=True) + + # Warmup + for _ in range(WARMUP): + try: + run_grep(client, word, BASE_URI) + except Exception: + pass + + # Benchmark runs + times = [] + match_count = 0 + failed = False + for _ in 
range(RUNS): + try: + elapsed, matches = run_grep(client, word, BASE_URI) + times.append(elapsed) + match_count = matches + except Exception as e: + failed = True + print(f"FAILED ({e})") + break + + if failed: + results.append({"label": label, "word": word, "probability": prob, "error": True}) + else: + avg_ms = sum(times) / len(times) * 1000 + min_ms = min(times) * 1000 + max_ms = max(times) * 1000 + # Recall: how many of the expected files were found + # This is approximate since injection is probabilistic + recall = match_count / expected if expected > 0 else 1.0 + print( + f"avg={avg_ms:.1f}ms matches={match_count} expected~{expected} recall~{recall:.2f}" + ) + results.append( + { + "label": label, + "word": word, + "probability": prob, + "avg_ms": round(avg_ms, 1), + "min_ms": round(min_ms, 1), + "max_ms": round(max_ms, 1), + "matches": match_count, + "expected_approx": expected, + "recall_approx": round(recall, 4), + } + ) + + # No-match test + label = "no-match: zzz_nonexistent_perf" + print(f" {label} ...", end=" ", flush=True) + for _ in range(WARMUP): + try: + run_grep(client, "zzz_nonexistent_perf", BASE_URI) + except Exception: + pass + times = [] + match_count = 0 + failed = False + for _ in range(RUNS): + try: + elapsed, matches = run_grep(client, "zzz_nonexistent_perf", BASE_URI) + times.append(elapsed) + match_count = matches + except Exception as e: + failed = True + print(f"FAILED ({e})") + break + if failed: + results.append({"label": label, "word": "zzz_nonexistent_perf", "error": True}) + else: + avg_ms = sum(times) / len(times) * 1000 + min_ms = min(times) * 1000 + max_ms = max(times) * 1000 + print(f"avg={avg_ms:.1f}ms matches={match_count}") + results.append( + { + "label": label, + "word": "zzz_nonexistent_perf", + "avg_ms": round(avg_ms, 1), + "min_ms": round(min_ms, 1), + "max_ms": round(max_ms, 1), + "matches": match_count, + } + ) + + return results + + +def print_comparison( + current_label: str, current: list[dict], compare_label: 
str, compare: list[dict] +): + compare_by_word = {} + for r in compare: + if "error" not in r and "word" in r: + compare_by_word[r["word"]] = r + + print() + print("=" * 120) + print(f" Comparison: {compare_label} vs {current_label}") + print("=" * 120) + print( + f"{'Word':<20} {'Prob':>8} {compare_label + '(ms)':>14} {current_label + '(ms)':>14} {'speedup':>10} {'Cmp matches':>12} {'Cur matches':>12}" + ) + print("-" * 120) + + for r in current: + if "error" in r: + print(f"{r.get('word', '?'):<20} {'ERR':>8} {'ERR':>14} {'ERR':>14} {'---':>10}") + continue + word = r["word"] + cur_ms = r["avg_ms"] + cmp = compare_by_word.get(word) + if not cmp: + print( + f"{word:<20} {r.get('probability', 0) * 100:>7.2f}% {'N/A':>14} {cur_ms:>14.1f} {'---':>10}" + ) + continue + cmp_ms = cmp["avg_ms"] + speedup = cmp_ms / cur_ms if cur_ms > 0 else float("inf") + speedup_str = f"{speedup:.1f}x" + print( + f"{word:<20} {r.get('probability', 0) * 100:>7.2f}% " + f"{cmp_ms:>14.1f} {cur_ms:>14.1f} {speedup_str:>10} " + f"{cmp.get('matches', '?'):>12} {r.get('matches', '?'):>12}" + ) + print() + + +def main(): + parser = argparse.ArgumentParser(description="Step 3 (Performance): Benchmark grep") + parser.add_argument( + "--engine-label", + required=True, + help="Label for this engine config (e.g. fs, auto). 
Used in output filename.", + ) + parser.add_argument( + "--compare", + default=None, + help="Path to a previous step3_result_*.json for comparison", + ) + args = parser.parse_args() + + total_files = count_local_files() + + print("=" * 80) + print(f"Step 3 (Performance): Grep Benchmark — engine={args.engine_label}") + print("=" * 80) + print(f" URI: {BASE_URI}") + print(f" Total files: {total_files:,}") + print(f" Runs per test: {RUNS} (warmup: {WARMUP})") + print() + print("Ensure ov.conf has the desired grep config and the server is restarted.") + print() + + client = SyncOpenViking() + client.initialize() + + try: + results = benchmark_engine(client, total_files) + finally: + client.close() + + output_file = f"step3_result_{args.engine_label}.json" + with open(output_file, "w") as f: + json.dump( + {"engine_label": args.engine_label, "total_files": total_files, "results": results}, + f, + indent=2, + ) + print(f"\nResults saved to {output_file}") + + print() + print( + f"{'Word':<20} {'Prob':>8} {'Avg(ms)':>10} {'Min(ms)':>10} {'Max(ms)':>10} {'Matches':>10} {'Expect~':>10} {'Recall~':>10}" + ) + print("-" * 108) + for r in results: + if "error" in r: + print(f"{r.get('word', '?'):<20} {'FAILED':>10}") + else: + print( + f"{r['word']:<20} {r.get('probability', 0) * 100:>7.2f}% " + f"{r['avg_ms']:>10.1f} {r['min_ms']:>10.1f} " + f"{r['max_ms']:>10.1f} {r['matches']:>10} " + f"{r.get('expected_approx', '?'):>10} {r.get('recall_approx', '?'):>10}" + ) + print() + + if args.compare: + if not os.path.isfile(args.compare): + print(f"Warning: compare file not found: {args.compare}") + else: + with open(args.compare) as f: + prev = json.load(f) + prev_label = prev.get("engine_label", "previous") + prev_results = prev.get("results", []) + print_comparison(args.engine_label, results, prev_label, prev_results) + + +if __name__ == "__main__": + main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step2_benchmark.py 
b/benchmark/retrieval/grep/vikingdb_bm25/step2_benchmark.py deleted file mode 100644 index 8d7519d2a1..0000000000 --- a/benchmark/retrieval/grep/vikingdb_bm25/step2_benchmark.py +++ /dev/null @@ -1,234 +0,0 @@ -#!/usr/bin/env python3 -"""Step 2: Benchmark grep performance for the current engine config. - -Prerequisites: - 1. Run step1_add_resource.py to import repos (includes VLM+embedding) - 2. Set ov.conf grep config and restart the server - -NOTE: `engine` and `switch_to_remote_threshold` are server-side config -(ov.conf `grep` section). To benchmark different engines, update ov.conf -and restart the server before each run. - -KEYWORDS: Fill the KEYWORDS list below with real terms from the imported -repos. Each keyword will be tested individually, plus multi-keyword regex -and no-match scenarios. - -Usage: - # Run 1: benchmark with fs engine - # 1. Set ov.conf: "grep": {"engine": "fs"} - # 2. Restart server - python3 step2_benchmark.py --engine-label fs - - # Run 2: benchmark with auto engine (bm25) - # 1. Set ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0} - # 2. Restart server - python3 step2_benchmark.py --engine-label auto --compare step2_result_fs.json - -Results are saved to step2_result_{engine_label}.json. -When --compare is given, a side-by-side comparison table is printed. 
-""" - -from __future__ import annotations - -import argparse -import json -import os -import time - -from openviking.sync_client import SyncOpenViking - -BASE_URI = "viking://resources/benchmark" -RUNS = 3 -WARMUP = 1 - -KEYWORDS: list[str] = [] - - -def build_test_cases() -> list[tuple[str, str, str]]: - cases = [] - - for kw in KEYWORDS: - cases.append((f"keyword: {kw}", kw, BASE_URI)) - - if len(KEYWORDS) >= 2: - cases.append( - (f"multi 2: {KEYWORDS[0]}|{KEYWORDS[1]}", f"{KEYWORDS[0]}|{KEYWORDS[1]}", BASE_URI) - ) - if len(KEYWORDS) >= 3: - cases.append( - ( - f"multi 3: {KEYWORDS[0]}|{KEYWORDS[1]}|{KEYWORDS[2]}", - f"{KEYWORDS[0]}|{KEYWORDS[1]}|{KEYWORDS[2]}", - BASE_URI, - ) - ) - - cases.append(("no-match: zzz_nonexistent_benchmark", "zzz_nonexistent_benchmark", BASE_URI)) - cases.append(("no-match 2: zzz_a|zzz_b", "zzz_a|zzz_b", BASE_URI)) - - return cases - - -def run_grep(client: SyncOpenViking, pattern: str, uri: str) -> tuple[float, int]: - start = time.monotonic() - result = client.grep(uri=uri, pattern=pattern, node_limit=100000) - elapsed = time.monotonic() - start - - match_count = 0 - if isinstance(result, dict): - matches = result.get("matches", []) - match_count = len(matches) - - return elapsed, match_count - - -def benchmark_engine(client: SyncOpenViking, engine_label: str) -> list[dict]: - test_cases = build_test_cases() - results = [] - - for label, pattern, uri in test_cases: - print(f" {label} ...", end=" ", flush=True) - - for _ in range(WARMUP): - try: - run_grep(client, pattern, uri) - except Exception: - pass - - times = [] - match_count = 0 - failed = False - for _ in range(RUNS): - try: - elapsed, matches = run_grep(client, pattern, uri) - times.append(elapsed) - match_count = matches - except Exception as e: - failed = True - print(f"FAILED ({e})") - break - - if failed: - results.append({"label": label, "pattern": pattern, "uri": uri, "error": True}) - else: - avg_ms = sum(times) / len(times) * 1000 - min_ms = min(times) * 1000 - 
max_ms = max(times) * 1000 - print(f"avg={avg_ms:.1f}ms min={min_ms:.1f}ms matches={match_count}") - results.append( - { - "label": label, - "pattern": pattern, - "uri": uri, - "avg_ms": round(avg_ms, 1), - "min_ms": round(min_ms, 1), - "max_ms": round(max_ms, 1), - "matches": match_count, - } - ) - - return results - - -def print_comparison( - current_label: str, current: list[dict], compare_label: str, compare: list[dict] -): - compare_by_label = {} - for r in compare: - if "error" not in r: - compare_by_label[r["label"]] = r - - print() - print("=" * 110) - print(f" Comparison: {compare_label} vs {current_label}") - print("=" * 110) - print( - f"{'Label':<50} {compare_label + '(ms)':>12} {current_label + '(ms)':>12} {'speedup':>10}" - ) - print("-" * 110) - - for r in current: - label = r["label"] - if "error" in r: - print(f"{label:<50} {'ERR':>12} {'ERR':>12} {'---':>10}") - continue - cur_ms = r["avg_ms"] - cmp = compare_by_label.get(label) - if not cmp: - print(f"{label:<50} {'N/A':>12} {cur_ms:>12.1f} {'---':>10}") - continue - cmp_ms = cmp["avg_ms"] - if cur_ms > 0: - speedup = cmp_ms / cur_ms - speedup_str = f"{speedup:.1f}x" - else: - speedup_str = "inf" - print(f"{label:<50} {cmp_ms:>12.1f} {cur_ms:>12.1f} {speedup_str:>10}") - - print() - - -def main(): - parser = argparse.ArgumentParser(description="Benchmark grep performance") - parser.add_argument( - "--engine-label", - required=True, - help="Label for this engine config (e.g. fs, auto). Used in output filename.", - ) - parser.add_argument( - "--compare", - default=None, - help="Path to a previous step2_result_*.json file for side-by-side comparison", - ) - args = parser.parse_args() - - if not KEYWORDS: - print("WARNING: KEYWORDS list is empty. 
Fill it with real terms before running.") - print(" Edit step2_benchmark.py and add keywords to the KEYWORDS list.\n") - - print("=" * 80) - print(f"Step 2: Grep Performance Benchmark — engine={args.engine_label}") - print("=" * 80) - print() - print("Ensure ov.conf has the desired grep config and the server is restarted.") - print() - - client = SyncOpenViking() - client.initialize() - - try: - results = benchmark_engine(client, args.engine_label) - finally: - client.close() - - output_file = f"step2_result_{args.engine_label}.json" - with open(output_file, "w") as f: - json.dump({"engine_label": args.engine_label, "results": results}, f, indent=2) - print(f"\nResults saved to {output_file}") - - print() - print(f"{'Label':<50} {'Avg(ms)':>10} {'Min(ms)':>10} {'Max(ms)':>10} {'Matches':>10}") - print("-" * 95) - for r in results: - if "error" in r: - print(f"{r['label']:<50} {'FAILED':>10}") - else: - print( - f"{r['label']:<50} {r['avg_ms']:>10.1f} {r['min_ms']:>10.1f} " - f"{r['max_ms']:>10.1f} {r['matches']:>10}" - ) - print() - - if args.compare: - if not os.path.isfile(args.compare): - print(f"Warning: compare file not found: {args.compare}") - else: - with open(args.compare) as f: - prev = json.load(f) - prev_label = prev.get("engine_label", "previous") - prev_results = prev.get("results", []) - print_comparison(args.engine_label, results, prev_label, prev_results) - - -if __name__ == "__main__": - main() From 6a807452f794f0afd07e24290fa4add2d1acd9da Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Mon, 1 Jun 2026 16:01:12 +0800 Subject: [PATCH 16/31] optimize (benchmark): adjust keywords and ground truth for testing --- .../retrieval/grep/vikingdb_bm25/README.md | 34 ++- .../retrieval/grep/vikingdb_bm25/README_CN.md | 34 ++- .../effectiveness/step3_quality.py | 268 ++++++++++++------ .../performance/step3_benchmark.py | 168 +++++++++-- 4 files changed, 371 insertions(+), 133 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README.md 
b/benchmark/retrieval/grep/vikingdb_bm25/README.md index d1f3a15b8e..9c70d998e6 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/README.md +++ b/benchmark/retrieval/grep/vikingdb_bm25/README.md @@ -27,8 +27,8 @@ Tests whether grep can find **all** matching files in real code repositories. | Step | Script | Description | |------|--------|-------------| | 1 | `step1_add_resource.py` | Import code repos (no indexing, fast) | -| 2 | `step2_reindex.py` | Async reindex via openviking-server (concurrency=2, polling) | -| 3 | `step3_quality.py` | Compare SDK grep vs local regex ground truth | +| 2 | `step2_reindex.py` | Async reindex via openviking-server (concurrency=16, polling) | +| 3 | `step3_quality.py` | Compare grep results vs ground truth (fs engine, cached) | ### Usage @@ -39,10 +39,20 @@ python3 step1_add_resource.py --source ~/.openviking/data/benchmark/OpenViking-m # Step 2: Build vector indexes (requires openviking-server running) python3 step2_reindex.py -# Optional: --concurrency N (default: 2) +# Optional: --concurrency N (default: 16) -# Step 3: Edit KEYWORDS list in step3_quality.py, then run -python3 step3_quality.py +# Step 3: Evaluate retrieval quality +# First run MUST use engine=fs in ov.conf to generate ground truth cache: +# 1. Set ov.conf: "grep": {"engine": "fs"} +# 2. Restart server +python3 step3_quality.py --keywords grep reindex SyncHTTPClient + +# Subsequent runs can use any engine (ground truth is read from cache): +# 1. Set ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0} +# 2. Restart server +python3 step3_quality.py --keywords grep reindex SyncHTTPClient + +# Optional: --regenerate-ground-truth (force recompute, requires engine=fs) ``` ## Performance — Latency & Recall at Scale @@ -55,8 +65,8 @@ Tests grep speed and recall on a large synthetic dataset (default: 100K files). 
|------|--------|-------------| | 0 | `step0_prepare_data.py` | Generate synthetic dataset (dir_xxx/wiki_xxx.txt) | | 1 | `step1_add_resource.py` | Import data (no VLM/embedding, fast) | -| 2 | `step2_reindex.py` | Async reindex via openviking-server (concurrency=2, polling) | -| 3 | `step3_benchmark.py` | Measure latency and recall | +| 2 | `step2_reindex.py` | Async reindex via openviking-server (concurrency=16, polling) | +| 3 | `step3_benchmark.py` | Measure latency and recall (ground truth from fs engine, cached) | ### Target Words @@ -82,10 +92,10 @@ python3 step1_add_resource.py # Step 2: Build vector indexes (requires openviking-server running) python3 step2_reindex.py -# Optional: --concurrency N (default: 2) +# Optional: --concurrency N (default: 16) # Step 3: Benchmark — run with different engine configs -# Run A: fs engine +# Run A: fs engine (also generates ground truth cache on first run) # 1. Set ov.conf: "grep": {"engine": "fs"} # 2. Restart server python3 step3_benchmark.py --engine-label fs @@ -98,8 +108,10 @@ python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json ## Key Concepts -- **Effectiveness** tests use real code and measure exact recall/precision against ground truth -- **Performance** tests use synthetic data with known injection probabilities for approximate recall +- **Effectiveness** tests compare grep results against ground truth from fs-engine grep (cached locally) +- **Performance** tests compare grep latency and match counts between engine configs (ground truth from fs-engine grep, cached) +- Both ground truth caches are stored in `~/.openviking/data/benchmark/.ground_truth/` +- First run of each step3 MUST use `engine=fs` in ov.conf to generate ground truth; subsequent runs can use any engine - Both follow the same workflow: import (no indexing) → reindex → benchmark/evaluate - Both support **resumable** execution via progress files (separate for import and reindex) - Change grep engine via `ov.conf` and 
restart the server between benchmark runs diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md index 23caec61f5..5f8f12a8af 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md +++ b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md @@ -27,8 +27,8 @@ vikingdb_bm25/ | 步骤 | 脚本 | 说明 | |------|------|------| | 1 | `step1_add_resource.py` | 导入代码仓库(不建索引,速度快) | -| 2 | `step2_reindex.py` | 通过 openviking-server 异步构建索引(并发=2,轮询) | -| 3 | `step3_quality.py` | SDK grep 与本地正则 ground truth 对比 | +| 2 | `step2_reindex.py` | 通过 openviking-server 异步构建索引(并发=16,轮询) | +| 3 | `step3_quality.py` | SDK grep 与 fs 引擎 ground truth 对比(缓存) | ### 使用方法 @@ -39,10 +39,20 @@ python3 step1_add_resource.py --source ~/.openviking/data/benchmark/OpenViking-m # 步骤 2:构建向量索引(需 openviking-server 运行中) python3 step2_reindex.py -# 可选参数:--concurrency N (默认:2) +# 可选参数:--concurrency N (默认:16) -# 步骤 3:编辑 step3_quality.py 中的 KEYWORDS 列表,然后运行 -python3 step3_quality.py +# 步骤 3:评估检索质量 +# 首次运行必须使用 engine=fs 生成 ground truth 缓存: +# 1. 设置 ov.conf: "grep": {"engine": "fs"} +# 2. 重启服务 +python3 step3_quality.py --keywords grep reindex SyncHTTPClient + +# 后续运行可使用任意引擎(ground truth 从缓存读取): +# 1. 设置 ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0} +# 2. 
重启服务 +python3 step3_quality.py --keywords grep reindex SyncHTTPClient + +# 可选参数:--regenerate-ground-truth (强制重算,需 engine=fs) ``` ## Performance — 检索性能 @@ -55,8 +65,8 @@ python3 step3_quality.py |------|------|------| | 0 | `step0_prepare_data.py` | 生成合成数据集(dir_xxx/wiki_xxx.txt) | | 1 | `step1_add_resource.py` | 导入数据(不建索引,速度快) | -| 2 | `step2_reindex.py` | 通过 openviking-server 异步构建索引(并发=2,轮询) | -| 3 | `step3_benchmark.py` | 测量延迟和召回率 | +| 2 | `step2_reindex.py` | 通过 openviking-server 异步构建索引(并发=16,轮询) | +| 3 | `step3_benchmark.py` | 测量延迟和召回率(ground truth 来自 fs 引擎,缓存) | ### 目标单词 @@ -82,10 +92,10 @@ python3 step1_add_resource.py # 步骤 2:构建向量索引(需 openviking-server 运行中) python3 step2_reindex.py -# 可选参数:--concurrency N (默认:2) +# 可选参数:--concurrency N (默认:16) # 步骤 3:基准测试 — 用不同引擎配置各跑一次 -# 运行 A:fs 引擎 +# 运行 A:fs 引擎(首次运行同时生成 ground truth 缓存) # 1. 设置 ov.conf: "grep": {"engine": "fs"} # 2. 重启服务 python3 step3_benchmark.py --engine-label fs @@ -98,8 +108,10 @@ python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json ## 核心概念 -- **Effectiveness(效果测试)** 使用真实代码,对照 ground truth 计算精确的召回率/精确率 -- **Performance(性能测试)** 使用合成数据,根据已知注入概率计算近似召回率 +- **Effectiveness(效果测试)** 将 grep 结果与 fs 引擎的 ground truth 对比(本地缓存) +- **Performance(性能测试)** 对比不同引擎的延迟和匹配数(ground truth 来自 fs 引擎,本地缓存) +- 两个 ground truth 缓存均存储在 `~/.openviking/data/benchmark/.ground_truth/` +- 每个 step3 首次运行必须使用 ov.conf 的 `engine=fs` 来生成 ground truth;后续运行可使用任意引擎 - 两者遵循相同流程:导入(不建索引)→ 构建索引 → 评估/测试 - 两者均支持**断点续传**(导入和索引各有独立进度文件) - 切换 grep 引擎需修改 `ov.conf` 并重启服务,在不同运行之间对比 diff --git a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py index 80500415fe..63c9c75b24 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py @@ -1,46 +1,79 @@ #!/usr/bin/env python3 """Step 3 (Effectiveness): Evaluate retrieval quality for real code repos. 
-Compares SDK grep results against local regex ground truth. +Compares grep results (current engine) against ground truth from fs-engine grep. Computes Recall, Precision, F1 per query pattern. +Ground truth is obtained by running grep with engine=fs (must be configured +in ov.conf on first run). Results are cached locally so subsequent runs +can use a different engine config while still comparing against the same +ground truth. + Prerequisites: 1. Run step1_add_resource.py to import repos (no indexing) 2. Run step2_reindex.py to build vector indexes - 3. Ensure ov.conf has the desired grep engine config - -KEYWORDS: Fill the KEYWORDS list below with real terms from the imported -repos before running. + 3. First run: set ov.conf grep engine to "fs" and restart server Usage: - python3 step3_quality.py + python3 step3_quality.py --keywords grep reindex SyncHTTPClient """ from __future__ import annotations +import hashlib +import json import os import re import time -from openviking.sync_client import SyncOpenViking +from openviking_cli.client.sync_http import SyncHTTPClient BASE_URI = "viking://resources/benchmark/effectiveness" DATA_DIR = os.path.expanduser("~/.openviking/data/benchmark") - -KEYWORDS: list[str] = [] - - -def build_test_patterns() -> list[tuple[str, str]]: +GROUND_TRUTH_DIR = os.path.join(DATA_DIR, ".ground_truth") +MISS_DIR = os.path.join(DATA_DIR, ".miss") + +KEYWORDS: list[str] = [ + # High frequency English + "embedding", + "grep", + # Medium frequency English + "vikingdb", + "reindex", + # Low frequency English + "build_index", + # CamelCase + "SyncHTTPClient", + "MarkdownParser", + "DataDirectoryLocked", + # snake_case + "add_resource", + "process_lock", + # Chinese + "检索", + "向量数据库", +] # Can also be overridden via --keywords + + +def _sanitize_filename(s: str, max_len: int = 40) -> str: + """Make a string safe for use as a filename component.""" + s = re.sub(r"[^a-zA-Z0-9_\-]", "_", s) + s = s.strip("_") + return s[:max_len] + + +def 
build_test_patterns(keywords: list[str] | None = None) -> list[tuple[str, str]]: + kws = keywords if keywords else KEYWORDS patterns = [] - for kw in KEYWORDS: + for kw in kws: patterns.append((f"keyword: {kw}", kw)) - if len(KEYWORDS) >= 2: - patterns.append((f"multi 2: {KEYWORDS[0]}|{KEYWORDS[1]}", f"{KEYWORDS[0]}|{KEYWORDS[1]}")) + if len(kws) >= 2: + patterns.append((f"multi 2: {kws[0]}|{kws[1]}", f"{kws[0]}|{kws[1]}")) patterns.append(("no-match: zzz_nonexistent_quality", "zzz_nonexistent_quality")) return patterns -def run_sdk_grep(client: SyncOpenViking, uri: str, pattern: str) -> tuple[set[str], float]: +def run_sdk_grep(client: SyncHTTPClient, uri: str, pattern: str) -> tuple[set[str], float]: t0 = time.monotonic() result = client.grep(uri=uri, pattern=pattern, node_limit=100000) elapsed = time.monotonic() - t0 @@ -53,56 +86,74 @@ def run_sdk_grep(client: SyncOpenViking, uri: str, pattern: str) -> tuple[set[st return uris, elapsed -def local_path_to_viking_uri(filepath: str) -> str: - rel = os.path.relpath(filepath, DATA_DIR) - return "viking://resources/" + rel.replace(os.sep, "/").rstrip("/") - - -def compute_ground_truth(pattern: str, search_dirs: list[str]) -> tuple[set[str], float]: - compiled = re.compile(pattern) - truth_uris = set() - t0 = time.monotonic() - for search_dir in search_dirs: - if not os.path.isdir(search_dir): - continue - for root, dirs, files in os.walk(search_dir): - dirs.sort() - for fname in sorted(files): - if not ( - fname.endswith(".py") - or fname.endswith(".md") - or fname.endswith(".rs") - or fname.endswith(".toml") - or fname.endswith(".yaml") - or fname.endswith(".yml") - or fname.endswith(".json") - or fname.endswith(".txt") - or fname.endswith(".cfg") - or fname.endswith(".ini") - ): - continue - filepath = os.path.join(root, fname) - try: - with open(filepath, errors="ignore") as f: - content = f.read() - if compiled.search(content): - truth_uris.add(local_path_to_viking_uri(filepath)) - except Exception: - pass - 
elapsed = time.monotonic() - t0 +def _cache_filename(prefix: str, pattern: str, uri: str) -> str: + """Generate filename: {prefix}_{sanitized_pattern}_{hash}.json""" + h = ( + hashlib.sha256(uri.encode()).update(pattern.encode()) + or hashlib.sha256(uri.encode() + pattern.encode()).hexdigest()[:8] + ) + safe_pattern = _sanitize_filename(pattern) + return f"{prefix}_{safe_pattern}_{h}.json" + + +def _ground_truth_cache_path(pattern: str, uri: str) -> str: + h = hashlib.sha256(uri.encode() + pattern.encode()).hexdigest()[:8] + safe_pattern = _sanitize_filename(pattern) + return os.path.join(GROUND_TRUTH_DIR, f"eff_{safe_pattern}_{h}.json") + + +def _load_ground_truth_cache(pattern: str, uri: str) -> set[str] | None: + path = _ground_truth_cache_path(pattern, uri) + if not os.path.isfile(path): + # Fallback: try old-style filename for backward compat + old_h = hashlib.sha256(uri.encode()).hexdigest() + old_h += hashlib.sha256(pattern.encode()).hexdigest()[:16] + old_path = os.path.join(GROUND_TRUTH_DIR, f"eff_{old_h[:16]}.json") + if os.path.isfile(old_path): + with open(old_path) as f: + data = json.load(f) + return set(data.get("uris", [])) + return None + with open(path) as f: + data = json.load(f) + return set(data.get("uris", [])) + + +def _save_ground_truth_cache(pattern: str, uri: str, uris: set[str]) -> None: + os.makedirs(GROUND_TRUTH_DIR, exist_ok=True) + path = _ground_truth_cache_path(pattern, uri) + with open(path, "w") as f: + json.dump({"pattern": pattern, "uri": uri, "uris": sorted(uris)}, f, indent=2) + + +def compute_ground_truth(client: SyncHTTPClient, uri: str, pattern: str) -> tuple[set[str], float]: + """Compute ground truth via OV grep (fs engine). 
First run must have engine=fs.""" + cached = _load_ground_truth_cache(pattern, uri) + if cached is not None: + return cached, 0.0 + + truth_uris, elapsed = run_sdk_grep(client, uri, pattern) + _save_ground_truth_cache(pattern, uri, truth_uris) return truth_uris, elapsed -def discover_local_repo_dirs() -> list[str]: - benchmark_dir = os.path.join(DATA_DIR, "benchmark") - if not os.path.isdir(benchmark_dir): - return [] - dirs = [] - for entry in sorted(os.listdir(benchmark_dir)): - path = os.path.join(benchmark_dir, entry) - if os.path.isdir(path) and not entry.startswith("."): - dirs.append(path) - return dirs +def _save_miss(pattern: str, uri: str, missed_uris: set[str], extra_uris: set[str]) -> None: + """Save miss analysis (FN and FP) to .miss directory.""" + if not missed_uris and not extra_uris: + return + os.makedirs(MISS_DIR, exist_ok=True) + h = hashlib.sha256(uri.encode() + pattern.encode()).hexdigest()[:8] + safe_pattern = _sanitize_filename(pattern) + path = os.path.join(MISS_DIR, f"eff_{safe_pattern}_{h}.json") + data: dict = {"pattern": pattern, "uri": uri} + if missed_uris: + data["missed_fn"] = sorted(missed_uris) + data["missed_fn_count"] = len(missed_uris) + if extra_uris: + data["extra_fp"] = sorted(extra_uris) + data["extra_fp_count"] = len(extra_uris) + with open(path, "w") as f: + json.dump(data, f, indent=2) def compute_metrics(truth: set[str], predicted: set[str]) -> dict: @@ -120,41 +171,69 @@ def compute_metrics(truth: set[str], predicted: set[str]) -> dict: def main(): - uri = BASE_URI - search_dirs = discover_local_repo_dirs() + import argparse - if not search_dirs: - print(f"Error: No repo directories found under {DATA_DIR}/benchmark/") - print("Run step1_add_resource.py first.") - return + parser = argparse.ArgumentParser( + description="Step 3 (Effectiveness): Evaluate retrieval quality" + ) + parser.add_argument( + "--keywords", + nargs="+", + default=None, + help="Keywords to search (e.g. 
--keywords grep reindex SyncHTTPClient)", + ) + parser.add_argument( + "--regenerate-ground-truth", + action="store_true", + help="Regenerate ground truth cache (requires engine=fs in ov.conf)", + ) + args = parser.parse_args() - if not KEYWORDS: + uri = BASE_URI + keywords = args.keywords if args.keywords else KEYWORDS + if not keywords: print("WARNING: KEYWORDS list is empty. Fill it with real terms before running.") - print(" Edit step2_quality.py and add keywords to the KEYWORDS list.\n") + print(" Use --keywords kw1 kw2 ... or edit step3_quality.py.\n") - test_patterns = build_test_patterns() + test_patterns = build_test_patterns(keywords) print("=" * 110) - print("Effectiveness Evaluation: SDK grep vs local regex (ground truth)") + print("Effectiveness Evaluation: grep vs ground truth (fs engine)") print("=" * 110) print(f"URI: {uri}") - print(f"Data dir: {DATA_DIR}/benchmark/") print(f"Patterns: {len(test_patterns)}") print() + print("NOTE: First run requires ov.conf grep engine=fs to generate ground truth.") + print(" Subsequent runs can use any engine; cached ground truth is reused.") + print() - results = [] - client = SyncOpenViking() + client = SyncHTTPClient(account="default", user="default") client.initialize() + # Phase 1: Compute ground truth (needs fs engine on first run) + print("--- Phase 1: Ground truth (fs engine) ---") + ground_truth_map: dict[str, tuple[set[str], float]] = {} + for label, pattern in test_patterns: + if args.regenerate_ground_truth: + cache_path = _ground_truth_cache_path(pattern, uri) + if os.path.isfile(cache_path): + os.remove(cache_path) + truth_uris, gt_elapsed = compute_ground_truth(client, uri, pattern) + ground_truth_map[pattern] = (truth_uris, gt_elapsed) + cached_str = "(cached)" if gt_elapsed == 0.0 else f"({gt_elapsed:.2f}s)" + print(f" {label}: {len(truth_uris)} matches {cached_str}") + + print() + print("--- Phase 2: Evaluate with current engine ---") + + results = [] try: for label, pattern in test_patterns: - 
print(f"--- {label} (pattern: {pattern}) ---") - truth_uris, fs_elapsed = compute_ground_truth(pattern, search_dirs) - print(f" Ground truth (local fs): {len(truth_uris)} matches ({fs_elapsed:.2f}s)") + truth_uris, gt_elapsed = ground_truth_map[pattern] try: auto_uris, auto_elapsed = run_sdk_grep(client, uri, pattern) except Exception as e: - print(f" SDK grep FAILED: {e}") + print(f" {label} FAILED: {e}") results.append( { "label": label, @@ -164,18 +243,21 @@ def main(): } ) continue - print(f" SDK grep: {len(auto_uris)} matches ({auto_elapsed:.2f}s)") metrics = compute_metrics(truth_uris, auto_uris) + + # Save miss analysis + missed_uris = truth_uris - auto_uris # FN + extra_uris = auto_uris - truth_uris # FP + _save_miss(pattern, uri, missed_uris, extra_uris) + + miss_str = f" missed={len(missed_uris)}" if missed_uris else "" + extra_str = f" extra={len(extra_uris)}" if extra_uris else "" print( - f" Recall: {metrics['recall']:.4f} " - f"Precision: {metrics['precision']:.4f} " - f"F1: {metrics['f1']:.4f}" + f" {label}: truth={len(truth_uris)} found={len(auto_uris)} " + f"Recall={metrics['recall']:.4f} Prec={metrics['precision']:.4f} F1={metrics['f1']:.4f}" + f"{miss_str}{extra_str}" ) - if metrics["fn"] > 0: - print(f" Missed (FN): {metrics['fn']}") - if metrics["fp"] > 0: - print(f" Extra (FP): {metrics['fp']}") results.append( { @@ -183,7 +265,7 @@ def main(): "pattern": pattern, "truth_count": len(truth_uris), "found_count": len(auto_uris), - "fs_elapsed_s": round(fs_elapsed, 3), + "gt_elapsed_s": round(gt_elapsed, 3), "sdk_elapsed_s": round(auto_elapsed, 3), **metrics, } @@ -194,21 +276,23 @@ def main(): print() print("=" * 120) print( - f"{'Label':<45} {'Truth':>6} {'Found':>6} {'Recall':>8} {'Prec':>8} {'F1':>8} {'Missed':>8}" + f"{'Label':<45} {'Truth':>6} {'Found':>6} {'Recall':>8} {'Prec':>8} {'F1':>8} {'Missed':>8} {'Extra':>8}" ) print("-" * 120) for r in results: if "error" in r: print( f"{r['label']:<45} {r['truth_count']:>6} {'ERR':>6} " - 
f"{'---':>8} {'---':>8} {'---':>8} {'---':>8}" + f"{'---':>8} {'---':>8} {'---':>8} {'---':>8} {'---':>8}" ) else: print( f"{r['label']:<45} {r['truth_count']:>6} {r['found_count']:>6} " - f"{r['recall']:>8.4f} {r['precision']:>8.4f} {r['f1']:>8.4f} {r['fn']:>8}" + f"{r['recall']:>8.4f} {r['precision']:>8.4f} {r['f1']:>8.4f} {r['fn']:>8} {r['fp']:>8}" ) print() + print(f"Miss analysis saved to: {MISS_DIR}/") + print(f"Ground truth cache: {GROUND_TRUTH_DIR}/") if __name__ == "__main__": diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py index b8725c5b6e..03395b5740 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py @@ -2,12 +2,13 @@ """Step 3 (Performance): Benchmark grep performance and recall. Runs grep queries against the synthetic dataset, measuring both latency -and recall. Target words from step0 are used as test queries, with -expected hit counts computed from the injection probabilities. +and recall. Ground truth (match counts per word) is obtained by running +grep with engine=fs on first run, then cached. Run twice with different ov.conf engine settings to compare: 1. Set ov.conf: "grep": {"engine": "fs"}, restart, then: python3 step3_benchmark.py --engine-label fs + (This also generates the ground truth cache) 2. 
Set ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0}, restart, then: python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json @@ -17,14 +18,19 @@ from __future__ import annotations import argparse +import hashlib import json import os +import re import time -from openviking.sync_client import SyncOpenViking +from openviking_cli.client.sync_http import SyncHTTPClient BASE_URI = "viking://resources/benchmark/performance" -DATA_DIR = os.path.expanduser("~/.openviking/data/benchmark/synthetic") +DATA_DIR = os.path.expanduser("~/.openviking/data/benchmark") +GROUND_TRUTH_DIR = os.path.join(DATA_DIR, ".ground_truth") +MISS_DIR = os.path.join(DATA_DIR, ".miss") +SYNTHETIC_DIR = os.path.expanduser("~/.openviking/data/benchmark/synthetic") # Same target words as step0_prepare_data.py TARGET_WORDS = { @@ -37,37 +43,148 @@ RUNS = 3 WARMUP = 1 +GROUND_TRUTH_DIR = os.path.join(DATA_DIR, ".ground_truth") +MISS_DIR = os.path.join(DATA_DIR, ".miss") +SYNTHETIC_DIR = os.path.expanduser("~/.openviking/data/benchmark/synthetic") + + +def _sanitize_filename(s: str, max_len: int = 40) -> str: + s = re.sub(r"[^a-zA-Z0-9_\-]", "_", s) + s = s.strip("_") + return s[:max_len] + + +def _perf_cache_path(uri: str) -> str: + h = hashlib.sha256(uri.encode()).hexdigest()[:8] + return os.path.join(GROUND_TRUTH_DIR, f"perf_{h}.json") + + +def _load_ground_truth_cache(uri: str) -> dict[str, int] | None: + path = _perf_cache_path(uri) + if not os.path.isfile(path): + # Fallback: try old-style filename + old_h = hashlib.sha256(SYNTHETIC_DIR.encode()) + for prob in sorted(TARGET_WORDS.keys()): + for word in TARGET_WORDS[prob]: + old_h.update(word.encode()) + old_key = old_h.hexdigest()[:16] + old_path = os.path.join(GROUND_TRUTH_DIR, f"perf_{old_key}.json") + if os.path.isfile(old_path): + with open(old_path) as f: + data = json.load(f) + return data.get("word_counts") + return None + with open(path) as f: + data = json.load(f) + return data.get("word_counts") 
+ + +def _save_ground_truth_cache(uri: str, word_counts: dict[str, int]) -> None: + os.makedirs(GROUND_TRUTH_DIR, exist_ok=True) + path = _perf_cache_path(uri) + with open(path, "w") as f: + json.dump({"uri": uri, "word_counts": word_counts}, f, indent=2) + + +def _perf_miss_path(engine_label: str) -> str: + h = hashlib.sha256(BASE_URI.encode()).hexdigest()[:8] + safe_label = _sanitize_filename(engine_label) + return os.path.join(MISS_DIR, f"perf_{safe_label}_{h}.json") + + +def _save_perf_miss( + engine_label: str, + results: list[dict], + ground_truth: dict[str, int], +) -> None: + """Save miss analysis (count diff per word) for performance benchmark.""" + miss_data: list[dict] = [] + has_miss = False + for r in results: + if "error" in r or "word" not in r: + continue + word = r["word"] + found = r.get("matches", 0) + expected = ground_truth.get(word, 0) + if found != expected or expected > 0: + miss_data.append( + { + "word": word, + "probability": r.get("probability"), + "expected": expected, + "found": found, + "diff": found - expected, + "recall_approx": r.get("recall_approx"), + } + ) + if found < expected: + has_miss = True + if not has_miss: + return + os.makedirs(MISS_DIR, exist_ok=True) + path = _perf_miss_path(engine_label) + with open(path, "w") as f: + json.dump({"engine_label": engine_label, "uri": BASE_URI, "misses": miss_data}, f, indent=2) + + +def compute_ground_truth(client: SyncHTTPClient, uri: str) -> tuple[dict[str, int], float]: + """Compute ground truth via OV grep (fs engine). 
Returns word -> match count.""" + cached = _load_ground_truth_cache(uri) + if cached is not None: + return cached, 0.0 + + all_words = [] + for prob in sorted(TARGET_WORDS.keys()): + all_words.extend(TARGET_WORDS[prob]) + + word_counts: dict[str, int] = {} + t0 = time.monotonic() + for w in all_words: + result = client.grep(uri=uri, pattern=w, node_limit=100000) + count = 0 + if isinstance(result, dict): + count = len(result.get("matches", [])) + word_counts[w] = count + elapsed = time.monotonic() - t0 + + _save_ground_truth_cache(uri, word_counts) + return word_counts, elapsed + def count_local_files() -> int: """Count total .txt files in the synthetic dataset.""" count = 0 - if not os.path.isdir(DATA_DIR): + if not os.path.isdir(SYNTHETIC_DIR): return 0 - for root, _dirs, files in os.walk(DATA_DIR): + for _root, _dirs, files in os.walk(SYNTHETIC_DIR): for f in files: if f.endswith(".txt"): count += 1 return count -def run_grep(client: SyncOpenViking, pattern: str, uri: str) -> tuple[float, int]: +def run_grep(client: SyncHTTPClient, pattern: str, uri: str) -> tuple[float, int, set[str]]: start = time.monotonic() result = client.grep(uri=uri, pattern=pattern, node_limit=100000) elapsed = time.monotonic() - start - match_count = 0 + match_uris: set[str] = set() if isinstance(result, dict): - matches = result.get("matches", []) - match_count = len(matches) - return elapsed, match_count + for match in result.get("matches", []): + uri_val = match.get("uri", "") + if uri_val: + match_uris.add(uri_val.rstrip("/")) + return elapsed, len(match_uris), match_uris -def benchmark_engine(client: SyncOpenViking, total_files: int) -> list[dict]: +def benchmark_engine( + client: SyncHTTPClient, total_files: int, ground_truth: dict[str, int] +) -> list[dict]: results = [] for prob in sorted(TARGET_WORDS.keys(), reverse=True): words = TARGET_WORDS[prob] for word in words: - expected = int(total_files * prob) + expected = ground_truth.get(word, int(total_files * prob)) label = 
f"{word} (p={prob * 100:.2f}%, expect~{expected})" print(f" {label} ...", end=" ", flush=True) @@ -85,7 +202,7 @@ def benchmark_engine(client: SyncOpenViking, total_files: int) -> list[dict]: failed = False for _ in range(RUNS): try: - elapsed, matches = run_grep(client, word, BASE_URI) + elapsed, matches, _ = run_grep(client, word, BASE_URI) times.append(elapsed) match_count = matches except Exception as e: @@ -132,7 +249,7 @@ def benchmark_engine(client: SyncOpenViking, total_files: int) -> list[dict]: failed = False for _ in range(RUNS): try: - elapsed, matches = run_grep(client, "zzz_nonexistent_perf", BASE_URI) + elapsed, matches, _ = run_grep(client, "zzz_nonexistent_perf", BASE_URI) times.append(elapsed) match_count = matches except Exception as e: @@ -216,6 +333,16 @@ def main(): total_files = count_local_files() + client = SyncHTTPClient(account="default", user="default") + client.initialize() + + print("Computing ground truth (OV grep, fs engine)...") + ground_truth, gt_elapsed = compute_ground_truth(client, BASE_URI) + if gt_elapsed > 0: + print(f" Ground truth computed in {gt_elapsed:.1f}s") + else: + print(" Ground truth loaded from cache") + print("=" * 80) print(f"Step 3 (Performance): Grep Benchmark — engine={args.engine_label}") print("=" * 80) @@ -226,11 +353,8 @@ def main(): print("Ensure ov.conf has the desired grep config and the server is restarted.") print() - client = SyncOpenViking() - client.initialize() - try: - results = benchmark_engine(client, total_files) + results = benchmark_engine(client, total_files, ground_truth) finally: client.close() @@ -243,6 +367,9 @@ def main(): ) print(f"\nResults saved to {output_file}") + # Save miss analysis + _save_perf_miss(args.engine_label, results, ground_truth) + print() print( f"{'Word':<20} {'Prob':>8} {'Avg(ms)':>10} {'Min(ms)':>10} {'Max(ms)':>10} {'Matches':>10} {'Expect~':>10} {'Recall~':>10}" @@ -270,6 +397,9 @@ def main(): prev_results = prev.get("results", []) 
print_comparison(args.engine_label, results, prev_label, prev_results) + print(f"\nMiss analysis saved to: {MISS_DIR}/") + print(f"Ground truth cache: {GROUND_TRUTH_DIR}/") + if __name__ == "__main__": main() From 599ae64946229aa9f8bc80775cc76a8740c58f42 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Mon, 1 Jun 2026 21:18:14 +0800 Subject: [PATCH 17/31] fix: truncate 64KB for content field --- .../effectiveness/step3_quality.py | 72 +++++++++++-------- .../performance/step3_benchmark.py | 30 ++++---- openviking/storage/ovpack/vectors.py | 2 +- .../queuefs/embedding_msg_converter.py | 3 +- openviking/storage/vectordb_adapters/base.py | 35 +++++++++ .../storage/vectordb_adapters/http_adapter.py | 9 ++- .../vikingdb_private_adapter.py | 4 +- .../vectordb_adapters/volcengine_adapter.py | 4 +- 8 files changed, 112 insertions(+), 47 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py index 63c9c75b24..5fa249a208 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py @@ -29,9 +29,10 @@ from openviking_cli.client.sync_http import SyncHTTPClient BASE_URI = "viking://resources/benchmark/effectiveness" -DATA_DIR = os.path.expanduser("~/.openviking/data/benchmark") +DATA_DIR = os.path.expanduser("~/.openviking/data/benchmark/effectiveness") GROUND_TRUTH_DIR = os.path.join(DATA_DIR, ".ground_truth") MISS_DIR = os.path.join(DATA_DIR, ".miss") +RESULT_DIR = os.path.join(DATA_DIR, ".result") KEYWORDS: list[str] = [ # High frequency English @@ -56,12 +57,17 @@ def _sanitize_filename(s: str, max_len: int = 40) -> str: - """Make a string safe for use as a filename component.""" - s = re.sub(r"[^a-zA-Z0-9_\-]", "_", s) - s = s.strip("_") + """Make a string safe for use as a filename component. 
Preserves Unicode.""" + s = re.sub(r'[/\\:*?"<>|\0]', "_", s) + s = s.strip("_ ") return s[:max_len] +def _cache_hash(uri: str, pattern: str) -> str: + """Short hash for cache disambiguation.""" + return hashlib.sha256(uri.encode("utf-8") + pattern.encode("utf-8")).hexdigest()[:8] + + def build_test_patterns(keywords: list[str] | None = None) -> list[tuple[str, str]]: kws = keywords if keywords else KEYWORDS patterns = [] @@ -86,18 +92,8 @@ def run_sdk_grep(client: SyncHTTPClient, uri: str, pattern: str) -> tuple[set[st return uris, elapsed -def _cache_filename(prefix: str, pattern: str, uri: str) -> str: - """Generate filename: {prefix}_{sanitized_pattern}_{hash}.json""" - h = ( - hashlib.sha256(uri.encode()).update(pattern.encode()) - or hashlib.sha256(uri.encode() + pattern.encode()).hexdigest()[:8] - ) - safe_pattern = _sanitize_filename(pattern) - return f"{prefix}_{safe_pattern}_{h}.json" - - def _ground_truth_cache_path(pattern: str, uri: str) -> str: - h = hashlib.sha256(uri.encode() + pattern.encode()).hexdigest()[:8] + h = _cache_hash(uri, pattern) safe_pattern = _sanitize_filename(pattern) return os.path.join(GROUND_TRUTH_DIR, f"eff_{safe_pattern}_{h}.json") @@ -105,16 +101,16 @@ def _ground_truth_cache_path(pattern: str, uri: str) -> str: def _load_ground_truth_cache(pattern: str, uri: str) -> set[str] | None: path = _ground_truth_cache_path(pattern, uri) if not os.path.isfile(path): - # Fallback: try old-style filename for backward compat - old_h = hashlib.sha256(uri.encode()).hexdigest() - old_h += hashlib.sha256(pattern.encode()).hexdigest()[:16] + # Fallback: try old-style hash-only filename + old_h = hashlib.sha256(uri.encode("utf-8")).hexdigest() + old_h += hashlib.sha256(pattern.encode("utf-8")).hexdigest()[:16] old_path = os.path.join(GROUND_TRUTH_DIR, f"eff_{old_h[:16]}.json") if os.path.isfile(old_path): - with open(old_path) as f: + with open(old_path, encoding="utf-8") as f: data = json.load(f) return set(data.get("uris", [])) return None - 
with open(path) as f: + with open(path, encoding="utf-8") as f: data = json.load(f) return set(data.get("uris", [])) @@ -122,8 +118,10 @@ def _load_ground_truth_cache(pattern: str, uri: str) -> set[str] | None: def _save_ground_truth_cache(pattern: str, uri: str, uris: set[str]) -> None: os.makedirs(GROUND_TRUTH_DIR, exist_ok=True) path = _ground_truth_cache_path(pattern, uri) - with open(path, "w") as f: - json.dump({"pattern": pattern, "uri": uri, "uris": sorted(uris)}, f, indent=2) + with open(path, "w", encoding="utf-8") as f: + json.dump( + {"pattern": pattern, "uri": uri, "uris": sorted(uris)}, f, indent=2, ensure_ascii=False + ) def compute_ground_truth(client: SyncHTTPClient, uri: str, pattern: str) -> tuple[set[str], float]: @@ -137,14 +135,18 @@ def compute_ground_truth(client: SyncHTTPClient, uri: str, pattern: str) -> tupl return truth_uris, elapsed +def _miss_cache_path(pattern: str, uri: str) -> str: + h = _cache_hash(uri, pattern) + safe_pattern = _sanitize_filename(pattern) + return os.path.join(MISS_DIR, f"eff_{safe_pattern}_{h}.json") + + def _save_miss(pattern: str, uri: str, missed_uris: set[str], extra_uris: set[str]) -> None: """Save miss analysis (FN and FP) to .miss directory.""" if not missed_uris and not extra_uris: return os.makedirs(MISS_DIR, exist_ok=True) - h = hashlib.sha256(uri.encode() + pattern.encode()).hexdigest()[:8] - safe_pattern = _sanitize_filename(pattern) - path = os.path.join(MISS_DIR, f"eff_{safe_pattern}_{h}.json") + path = _miss_cache_path(pattern, uri) data: dict = {"pattern": pattern, "uri": uri} if missed_uris: data["missed_fn"] = sorted(missed_uris) @@ -152,8 +154,8 @@ def _save_miss(pattern: str, uri: str, missed_uris: set[str], extra_uris: set[st if extra_uris: data["extra_fp"] = sorted(extra_uris) data["extra_fp_count"] = len(extra_uris) - with open(path, "w") as f: - json.dump(data, f, indent=2) + with open(path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) def 
compute_metrics(truth: set[str], predicted: set[str]) -> dict: @@ -273,6 +275,7 @@ def main(): finally: client.close() + # Summary table print() print("=" * 120) print( @@ -291,8 +294,19 @@ def main(): f"{r['recall']:>8.4f} {r['precision']:>8.4f} {r['f1']:>8.4f} {r['fn']:>8} {r['fp']:>8}" ) print() - print(f"Miss analysis saved to: {MISS_DIR}/") - print(f"Ground truth cache: {GROUND_TRUTH_DIR}/") + + # Save results to local file + output_file = os.path.join(RESULT_DIR, "step3_result.json") + with open(output_file, "w", encoding="utf-8") as f: + json.dump( + {"uri": uri, "patterns": len(test_patterns), "results": results}, + f, + indent=2, + ensure_ascii=False, + ) + print(f"Results saved to: {output_file}") + print(f"Miss analysis saved to: {MISS_DIR}/") + print(f"Ground truth cache: {GROUND_TRUTH_DIR}/") if __name__ == "__main__": diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py index 03395b5740..ec06b0657b 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py @@ -27,7 +27,7 @@ from openviking_cli.client.sync_http import SyncHTTPClient BASE_URI = "viking://resources/benchmark/performance" -DATA_DIR = os.path.expanduser("~/.openviking/data/benchmark") +DATA_DIR = os.path.expanduser("~/.openviking/data/benchmark/performance") GROUND_TRUTH_DIR = os.path.join(DATA_DIR, ".ground_truth") MISS_DIR = os.path.join(DATA_DIR, ".miss") SYNTHETIC_DIR = os.path.expanduser("~/.openviking/data/benchmark/synthetic") @@ -55,7 +55,7 @@ def _sanitize_filename(s: str, max_len: int = 40) -> str: def _perf_cache_path(uri: str) -> str: - h = hashlib.sha256(uri.encode()).hexdigest()[:8] + h = hashlib.sha256(uri.encode("utf-8")).hexdigest()[:8] return os.path.join(GROUND_TRUTH_DIR, f"perf_{h}.json") @@ -63,18 +63,18 @@ def _load_ground_truth_cache(uri: str) -> dict[str, int] | None: 
path = _perf_cache_path(uri) if not os.path.isfile(path): # Fallback: try old-style filename - old_h = hashlib.sha256(SYNTHETIC_DIR.encode()) + old_h = hashlib.sha256(SYNTHETIC_DIR.encode("utf-8")) for prob in sorted(TARGET_WORDS.keys()): for word in TARGET_WORDS[prob]: - old_h.update(word.encode()) + old_h.update(word.encode("utf-8")) old_key = old_h.hexdigest()[:16] old_path = os.path.join(GROUND_TRUTH_DIR, f"perf_{old_key}.json") if os.path.isfile(old_path): - with open(old_path) as f: + with open(old_path, encoding="utf-8") as f: data = json.load(f) return data.get("word_counts") return None - with open(path) as f: + with open(path, encoding="utf-8") as f: data = json.load(f) return data.get("word_counts") @@ -82,12 +82,12 @@ def _load_ground_truth_cache(uri: str) -> dict[str, int] | None: def _save_ground_truth_cache(uri: str, word_counts: dict[str, int]) -> None: os.makedirs(GROUND_TRUTH_DIR, exist_ok=True) path = _perf_cache_path(uri) - with open(path, "w") as f: - json.dump({"uri": uri, "word_counts": word_counts}, f, indent=2) + with open(path, "w", encoding="utf-8") as f: + json.dump({"uri": uri, "word_counts": word_counts}, f, indent=2, ensure_ascii=False) def _perf_miss_path(engine_label: str) -> str: - h = hashlib.sha256(BASE_URI.encode()).hexdigest()[:8] + h = hashlib.sha256(BASE_URI.encode("utf-8")).hexdigest()[:8] safe_label = _sanitize_filename(engine_label) return os.path.join(MISS_DIR, f"perf_{safe_label}_{h}.json") @@ -123,8 +123,13 @@ def _save_perf_miss( return os.makedirs(MISS_DIR, exist_ok=True) path = _perf_miss_path(engine_label) - with open(path, "w") as f: - json.dump({"engine_label": engine_label, "uri": BASE_URI, "misses": miss_data}, f, indent=2) + with open(path, "w", encoding="utf-8") as f: + json.dump( + {"engine_label": engine_label, "uri": BASE_URI, "misses": miss_data}, + f, + indent=2, + ensure_ascii=False, + ) def compute_ground_truth(client: SyncHTTPClient, uri: str) -> tuple[dict[str, int], float]: @@ -359,11 +364,12 @@ def 
main(): client.close() output_file = f"step3_result_{args.engine_label}.json" - with open(output_file, "w") as f: + with open(output_file, "w", encoding="utf-8") as f: json.dump( {"engine_label": args.engine_label, "total_files": total_files, "results": results}, f, indent=2, + ensure_ascii=False, ) print(f"\nResults saved to {output_file}") diff --git a/openviking/storage/ovpack/vectors.py b/openviking/storage/ovpack/vectors.py index 886704745d..5599e87e84 100644 --- a/openviking/storage/ovpack/vectors.py +++ b/openviking/storage/ovpack/vectors.py @@ -295,7 +295,7 @@ async def _upsert_vector_snapshot_record( if not payload.get("abstract"): payload["abstract"] = str(record.get("text") or "") if "content" not in payload: - payload["content"] = str(record.get("text") or "")[:65536] + payload["content"] = str(record.get("text") or "") try: await vector_store.upsert(payload, ctx=ctx) diff --git a/openviking/storage/queuefs/embedding_msg_converter.py b/openviking/storage/queuefs/embedding_msg_converter.py index 6cda82b4e4..9a579344d4 100644 --- a/openviking/storage/queuefs/embedding_msg_converter.py +++ b/openviking/storage/queuefs/embedding_msg_converter.py @@ -70,9 +70,8 @@ def from_context(context: Context) -> EmbeddingMsg: # Store full content in content field for bm25 full-text search. # Use full_text (raw file content) when available; fall back to vectorization_text. - # Truncate to 64KB (VikingDB text field limit). 
full_content = context.vectorize.full_text or vectorization_text - context_data["content"] = full_content[:65536] + context_data["content"] = full_content embedding_msg = EmbeddingMsg( message=vectorization_text, diff --git a/openviking/storage/vectordb_adapters/base.py b/openviking/storage/vectordb_adapters/base.py index 3733a0ebad..4599fbf965 100644 --- a/openviking/storage/vectordb_adapters/base.py +++ b/openviking/storage/vectordb_adapters/base.py @@ -31,6 +31,29 @@ logger = get_logger(__name__) +# --------------------------------------------------------------------------- +# VikingDB text field byte limit +# --------------------------------------------------------------------------- +# VikingDB rejects upsert when any text field exceeds this byte length. +# Truncation is applied at a valid UTF-8 character boundary so that +# multi-byte sequences are never split in the middle. +VIKINGDB_TEXT_FIELD_BYTE_LIMIT: int = 65535 + + +def _truncate_text_field(text: str, byte_limit: int = VIKINGDB_TEXT_FIELD_BYTE_LIMIT) -> str: + """Truncate *text* so its UTF-8 encoding does not exceed *byte_limit*. + + Walks backwards from *byte_limit* to find the nearest valid UTF-8 lead + byte, ensuring no multi-byte character is split. + """ + encoded = text.encode("utf-8") + if len(encoded) <= byte_limit: + return text + cut = byte_limit + while cut > 0 and (encoded[cut] & 0xC0) == 0x80: + cut -= 1 + return encoded[:cut].decode("utf-8") + def _parse_url(url: str) -> tuple[str, int]: normalized = url @@ -64,6 +87,13 @@ class CollectionAdapter(ABC): mode: str _URI_FIELD_NAMES = {"uri", "parent_uri"} + # Text fields subject to byte-limit truncation before upsert. + _TRUNCATABLE_TEXT_FIELDS: tuple[str, ...] = ("content", "abstract") + + # Per-backend byte limit for text fields. ``None`` means no truncation. + # Subclasses backed by VikingDB should set this to ``VIKINGDB_TEXT_FIELD_BYTE_LIMIT``. 
+ _TEXT_FIELD_BYTE_LIMIT: int | None = None + def __init__(self, collection_name: str, index_name: str = DEFAULT_INDEX_NAME): self._collection_name = collection_name self._index_name = index_name @@ -214,6 +244,11 @@ def _normalize_record_for_write(self, record: Dict[str, Any]) -> Dict[str, Any]: for key in self._URI_FIELD_NAMES: if key in normalized: normalized[key] = self._encode_uri_field_value(normalized[key]) + if self._TEXT_FIELD_BYTE_LIMIT is not None: + for field in self._TRUNCATABLE_TEXT_FIELDS: + value = normalized.get(field) + if isinstance(value, str): + normalized[field] = _truncate_text_field(value, self._TEXT_FIELD_BYTE_LIMIT) return normalized @staticmethod diff --git a/openviking/storage/vectordb_adapters/http_adapter.py b/openviking/storage/vectordb_adapters/http_adapter.py index ce9d96e610..f5ad0f208e 100644 --- a/openviking/storage/vectordb_adapters/http_adapter.py +++ b/openviking/storage/vectordb_adapters/http_adapter.py @@ -13,12 +13,19 @@ list_vikingdb_collections, ) -from .base import CollectionAdapter, _normalize_collection_names, _parse_url +from .base import ( + VIKINGDB_TEXT_FIELD_BYTE_LIMIT, + CollectionAdapter, + _normalize_collection_names, + _parse_url, +) class HttpCollectionAdapter(CollectionAdapter): """Adapter for remote HTTP vectordb project.""" + _TEXT_FIELD_BYTE_LIMIT = VIKINGDB_TEXT_FIELD_BYTE_LIMIT + def __init__( self, host: str, diff --git a/openviking/storage/vectordb_adapters/vikingdb_private_adapter.py b/openviking/storage/vectordb_adapters/vikingdb_private_adapter.py index 2342d57ba4..b245814ba1 100644 --- a/openviking/storage/vectordb_adapters/vikingdb_private_adapter.py +++ b/openviking/storage/vectordb_adapters/vikingdb_private_adapter.py @@ -10,12 +10,14 @@ from openviking.storage.vectordb.collection.vikingdb_clients import VIKINGDB_APIS, VikingDBClient from openviking.storage.vectordb.collection.vikingdb_collection import VikingDBCollection -from .base import CollectionAdapter +from .base import 
VIKINGDB_TEXT_FIELD_BYTE_LIMIT, CollectionAdapter class VikingDBPrivateCollectionAdapter(CollectionAdapter): """Adapter for private VikingDB deployment.""" + _TEXT_FIELD_BYTE_LIMIT = VIKINGDB_TEXT_FIELD_BYTE_LIMIT + def __init__( self, *, diff --git a/openviking/storage/vectordb_adapters/volcengine_adapter.py b/openviking/storage/vectordb_adapters/volcengine_adapter.py index 133ac92b85..fb483fa758 100644 --- a/openviking/storage/vectordb_adapters/volcengine_adapter.py +++ b/openviking/storage/vectordb_adapters/volcengine_adapter.py @@ -15,12 +15,14 @@ get_or_create_volcengine_collection, ) -from .base import CollectionAdapter +from .base import VIKINGDB_TEXT_FIELD_BYTE_LIMIT, CollectionAdapter class VolcengineCollectionAdapter(CollectionAdapter): """Adapter for Volcengine-hosted VikingDB.""" + _TEXT_FIELD_BYTE_LIMIT = VIKINGDB_TEXT_FIELD_BYTE_LIMIT + def __init__( self, *, From f13617b8fedae3029e9f28891221bb70788ec4db Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Tue, 2 Jun 2026 10:59:11 +0800 Subject: [PATCH 18/31] optimize: effectiveness add resource plainly --- .../effectiveness/step1_add_resource.py | 68 ++++++++----------- .../effectiveness/step3_quality.py | 1 + 2 files changed, 29 insertions(+), 40 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py index 11b04e9c91..b0bf9bafcd 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py @@ -1,9 +1,12 @@ #!/usr/bin/env python3 """Step 1 (Effectiveness): Import real code repos into OpenViking (no indexing). -Recursively scans a local directory, imports each subdirectory (at all depths) -via SyncOpenViking.add_resource (wait=True, build_index=False, summarize=False), -and saves progress after each directory for resumability. 
+Scans first-level subdirectories under the source directory and imports each +as a whole repo via SyncOpenViking.add_resource (wait=True, build_index=False, +summarize=False). add_resource itself handles recursive traversal of files +within each repo, so we only need to enumerate top-level directories. + +Progress is saved after each directory for resumability. After all imports are done, run step2_reindex.py to build vector indexes, then step3_quality.py to evaluate retrieval quality. @@ -42,27 +45,13 @@ def save_progress(rel_dir: str) -> None: f.write(rel_dir + "\n") -def scan_subdirs_recursive(root: str) -> list[str]: - """Return sorted list of all subdirectory relative paths (deterministic order).""" - result: list[str] = [] - - def _walk(dir_path: str, rel_prefix: str) -> None: - try: - entries = sorted(os.listdir(dir_path)) - except OSError: - return - for name in entries: - if name.startswith("."): - continue - full = os.path.join(dir_path, name) - if not os.path.isdir(full): - continue - rel = f"{rel_prefix}/{name}" if rel_prefix else name - result.append(rel) - _walk(full, rel) - - _walk(root, "") - return result +def scan_first_level_dirs(root: str) -> list[str]: + """Return sorted list of first-level subdirectory names.""" + try: + entries = sorted(os.listdir(root)) + except OSError: + return [] + return [e for e in entries if not e.startswith(".") and os.path.isdir(os.path.join(root, e))] def main(): @@ -95,9 +84,9 @@ def main(): print(" Indexing: DISABLED (build_index=False, summarize=False)") print() - subdirs = scan_subdirs_recursive(source) - total = len(subdirs) - print(f" Total directories to import: {total}") + first_level = scan_first_level_dirs(source) + total = len(first_level) + print(f" First-level directories to import: {total}") print() if total == 0: @@ -106,7 +95,7 @@ def main(): completed = load_progress() if completed: - already_done = [d for d in subdirs if d in completed] + already_done = [d for d in first_level if d in completed] 
print(f" Resuming: {len(already_done)} directories already imported") print() @@ -114,22 +103,21 @@ def main(): client.initialize() results = [] - for i, rel_dir in enumerate(subdirs, 1): - if rel_dir in completed: - print(f" [{i}/{total}] SKIP (already done): {rel_dir}") + for i, name in enumerate(first_level, 1): + if name in completed: + print(f" [{i}/{total}] SKIP (already done): {name}") continue - dir_path = os.path.join(source, rel_dir) - parent_rel = os.path.dirname(rel_dir) - parent_uri = f"{args.parent}/{parent_rel}" if parent_rel else args.parent - print(f" [{i}/{total}] Importing: {rel_dir} ...", end="", flush=True) + dir_path = os.path.join(source, name) + parent_uri = args.parent + print(f" [{i}/{total}] Importing: {name} ...", end="", flush=True) t0 = time.monotonic() try: result = client.add_resource( path=dir_path, parent=parent_uri, - reason=f"benchmark effectiveness: {rel_dir}", + reason=f"benchmark effectiveness: {name}", wait=True, create_parent=True, build_index=False, @@ -138,14 +126,14 @@ def main(): elapsed = time.monotonic() - t0 root_uri = result.get("root_uri", "?") print(f" OK ({elapsed:.1f}s) -> {root_uri}") - save_progress(rel_dir) - results.append({"dir": rel_dir, "status": "ok", "elapsed_s": round(elapsed, 1)}) + save_progress(name) + results.append({"dir": name, "status": "ok", "elapsed_s": round(elapsed, 1)}) except Exception as e: elapsed = time.monotonic() - t0 print(f" FAILED ({elapsed:.1f}s): {e}") results.append( { - "dir": rel_dir, + "dir": name, "status": "failed", "elapsed_s": round(elapsed, 1), "error": str(e)[:500], @@ -167,7 +155,7 @@ def main(): ok_count = sum(1 for r in results if r["status"] == "ok") failed_count = sum(1 for r in results if r["status"] == "failed") - skipped_count = sum(1 for d in subdirs if d in completed) + skipped_count = sum(1 for d in first_level if d in completed) total_done = skipped_count + ok_count print() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py 
b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py index 5fa249a208..16f74bca7b 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py @@ -296,6 +296,7 @@ def main(): print() # Save results to local file + os.makedirs(RESULT_DIR, exist_ok=True) output_file = os.path.join(RESULT_DIR, "step3_result.json") with open(output_file, "w", encoding="utf-8") as f: json.dump( From 0fe3c1e1868d44450bf5a8f5dc6377c8ed364c83 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Wed, 3 Jun 2026 17:25:24 +0800 Subject: [PATCH 19/31] optimize: change param use of SearchByKeywords from "keywords" to "query" --- openviking/storage/viking_fs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index c4323b23f7..ef87df514a 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -933,9 +933,10 @@ async def _grep_vikingdb_then_fs( """VikingDB bm25 recall + local fs precise matching.""" vector_store = self._get_vector_store() - # Split regex alternation (e.g. "error|warning|fail") into individual keywords - # for bm25 search. Limit to 10 keywords per VikingDB API constraint. - keywords = [kw.strip() for kw in pattern.split("|") if kw.strip()][:10] + # Split regex alternation (e.g. "error|warning|fail") and join as a + # single query string for bm25 search. VikingDB's standard tokenizer + # will handle the tokenization of the query string. 
+ query = " ".join(kw.strip() for kw in pattern.split("|") if kw.strip()) filter_expr = PathScope("uri", uri, depth=level_limit) # Auto-adapt remote_return_limit: when 0 (default), use the maximum @@ -948,7 +949,7 @@ async def _grep_vikingdb_then_fs( # Step 1: vikingdb recall candidate files try: result = await vector_store.search_by_keywords( - keywords=keywords, + query=query, limit=remote_return_limit, filter=filter_expr, output_fields=["uri"], From 02d18d8793a6edb57591e098fa667dce6434220b Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Thu, 4 Jun 2026 15:59:49 +0800 Subject: [PATCH 20/31] optimize(benchmark): refactor effectiveness scripts --- .../retrieval/grep/vikingdb_bm25/README.md | 22 +- .../retrieval/grep/vikingdb_bm25/README_CN.md | 22 +- .../effectiveness/step1_add_resource.py | 144 +++-------- .../{step3_quality.py => step2_quality.py} | 15 +- .../effectiveness/step2_reindex.py | 231 ------------------ 5 files changed, 51 insertions(+), 383 deletions(-) rename benchmark/retrieval/grep/vikingdb_bm25/effectiveness/{step3_quality.py => step2_quality.py} (95%) delete mode 100644 benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_reindex.py diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README.md b/benchmark/retrieval/grep/vikingdb_bm25/README.md index 9c70d998e6..cf033c616c 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/README.md +++ b/benchmark/retrieval/grep/vikingdb_bm25/README.md @@ -9,8 +9,7 @@ vikingdb_bm25/ ├── ai_wiki.txt # Source text for synthetic data generation ├── effectiveness/ # Retrieval effectiveness (recall/precision/F1) │ ├── step1_add_resource.py -│ ├── step2_reindex.py -│ └── step3_quality.py +│ └── step2_quality.py └── performance/ # Retrieval performance (latency + recall at scale) ├── step0_prepare_data.py ├── step1_add_resource.py @@ -26,31 +25,26 @@ Tests whether grep can find **all** matching files in real code repositories. 
| Step | Script | Description | |------|--------|-------------| -| 1 | `step1_add_resource.py` | Import code repos (no indexing, fast) | -| 2 | `step2_reindex.py` | Async reindex via openviking-server (concurrency=16, polling) | -| 3 | `step3_quality.py` | Compare grep results vs ground truth (fs engine, cached) | +| 1 | `step1_add_resource.py` | Import code repos (with indexing, single import) | +| 2 | `step2_quality.py` | Compare grep results vs ground truth (fs engine, cached) | ### Usage ```bash -# Step 1: Import repos (no VLM/embedding) +# Step 1: Import repos (with VLM/embedding, single import) cd effectiveness/ python3 step1_add_resource.py --source ~/.openviking/data/benchmark/OpenViking-main -# Step 2: Build vector indexes (requires openviking-server running) -python3 step2_reindex.py -# Optional: --concurrency N (default: 16) - -# Step 3: Evaluate retrieval quality +# Step 2: Evaluate retrieval quality # First run MUST use engine=fs in ov.conf to generate ground truth cache: # 1. Set ov.conf: "grep": {"engine": "fs"} # 2. Restart server -python3 step3_quality.py --keywords grep reindex SyncHTTPClient +python3 step2_quality.py --keywords grep reindex SyncHTTPClient # Subsequent runs can use any engine (ground truth is read from cache): # 1. Set ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0} # 2. 
Restart server -python3 step3_quality.py --keywords grep reindex SyncHTTPClient +python3 step2_quality.py --keywords grep reindex SyncHTTPClient # Optional: --regenerate-ground-truth (force recompute, requires engine=fs) ``` @@ -112,6 +106,6 @@ python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json - **Performance** tests compare grep latency and match counts between engine configs (ground truth from fs-engine grep, cached) - Both ground truth caches are stored in `~/.openviking/data/benchmark/.ground_truth/` - First run of each step3 MUST use `engine=fs` in ov.conf to generate ground truth; subsequent runs can use any engine -- Both follow the same workflow: import (no indexing) → reindex → benchmark/evaluate +- Both follow the same workflow: import (with indexing) → benchmark/evaluate - Both support **resumable** execution via progress files (separate for import and reindex) - Change grep engine via `ov.conf` and restart the server between benchmark runs diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md index 5f8f12a8af..3801855063 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md +++ b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md @@ -9,8 +9,7 @@ vikingdb_bm25/ ├── ai_wiki.txt # 合成数据生成的原始文本 ├── effectiveness/ # 检索效果测试(召回率/精确率/F1) │ ├── step1_add_resource.py -│ ├── step2_reindex.py -│ └── step3_quality.py +│ └── step2_quality.py └── performance/ # 检索性能测试(延迟 + 大规模召回) ├── step0_prepare_data.py ├── step1_add_resource.py @@ -26,31 +25,26 @@ vikingdb_bm25/ | 步骤 | 脚本 | 说明 | |------|------|------| -| 1 | `step1_add_resource.py` | 导入代码仓库(不建索引,速度快) | -| 2 | `step2_reindex.py` | 通过 openviking-server 异步构建索引(并发=16,轮询) | -| 3 | `step3_quality.py` | SDK grep 与 fs 引擎 ground truth 对比(缓存) | +| 1 | `step1_add_resource.py` | 导入代码仓库(含建索引,一次性导入) | +| 2 | `step2_quality.py` | SDK grep 与 fs 引擎 ground truth 对比(缓存) | ### 使用方法 ```bash -# 步骤 1:导入代码仓库(不建索引) +# 步骤 
1:导入代码仓库(含建索引,一次性导入) cd effectiveness/ python3 step1_add_resource.py --source ~/.openviking/data/benchmark/OpenViking-main -# 步骤 2:构建向量索引(需 openviking-server 运行中) -python3 step2_reindex.py -# 可选参数:--concurrency N (默认:16) - -# 步骤 3:评估检索质量 +# 步骤 2:评估检索质量 # 首次运行必须使用 engine=fs 生成 ground truth 缓存: # 1. 设置 ov.conf: "grep": {"engine": "fs"} # 2. 重启服务 -python3 step3_quality.py --keywords grep reindex SyncHTTPClient +python3 step2_quality.py # 后续运行可使用任意引擎(ground truth 从缓存读取): # 1. 设置 ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0} # 2. 重启服务 -python3 step3_quality.py --keywords grep reindex SyncHTTPClient +python3 step2_quality.py # 可选参数:--regenerate-ground-truth (强制重算,需 engine=fs) ``` @@ -112,6 +106,6 @@ python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json - **Performance(性能测试)** 对比不同引擎的延迟和匹配数(ground truth 来自 fs 引擎,本地缓存) - 两个 ground truth 缓存均存储在 `~/.openviking/data/benchmark/.ground_truth/` - 每个 step3 首次运行必须使用 ov.conf 的 `engine=fs` 来生成 ground truth;后续运行可使用任意引擎 -- 两者遵循相同流程:导入(不建索引)→ 构建索引 → 评估/测试 +- 两者遵循相同流程:导入(含建索引)→ 评估/测试 - 两者均支持**断点续传**(导入和索引各有独立进度文件) - 切换 grep 引擎需修改 `ov.conf` 并重启服务,在不同运行之间对比 diff --git a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py index b0bf9bafcd..e8e18c08b4 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step1_add_resource.py @@ -1,15 +1,11 @@ #!/usr/bin/env python3 -"""Step 1 (Effectiveness): Import real code repos into OpenViking (no indexing). +"""Step 1 (Effectiveness): Import real code repos into OpenViking (with indexing). -Scans first-level subdirectories under the source directory and imports each -as a whole repo via SyncOpenViking.add_resource (wait=True, build_index=False, -summarize=False). 
add_resource itself handles recursive traversal of files -within each repo, so we only need to enumerate top-level directories. +Imports the entire source directory as a single resource via +SyncOpenViking.add_resource (wait=True, build_index=True, summarize=True). +add_resource handles recursive traversal internally. -Progress is saved after each directory for resumability. - -After all imports are done, run step2_reindex.py to build vector indexes, -then step3_quality.py to evaluate retrieval quality. +After import, run step2_quality.py to evaluate retrieval quality. Prerequisites: - Download code repos and place them under the source directory manually. @@ -28,35 +24,12 @@ from openviking.sync_client import SyncOpenViking DEFAULT_SOURCE = os.path.expanduser("~/.openviking/data/benchmark/OpenViking-main") -PROGRESS_FILE = os.path.expanduser("~/.openviking/data/benchmark/.effectiveness-import-progress") BENCHMARK_PARENT = "viking://resources/benchmark/effectiveness" -def load_progress() -> set[str]: - if not os.path.exists(PROGRESS_FILE): - return set() - with open(PROGRESS_FILE) as f: - return {line.strip() for line in f if line.strip()} - - -def save_progress(rel_dir: str) -> None: - os.makedirs(os.path.dirname(PROGRESS_FILE), exist_ok=True) - with open(PROGRESS_FILE, "a") as f: - f.write(rel_dir + "\n") - - -def scan_first_level_dirs(root: str) -> list[str]: - """Return sorted list of first-level subdirectory names.""" - try: - entries = sorted(os.listdir(root)) - except OSError: - return [] - return [e for e in entries if not e.startswith(".") and os.path.isdir(os.path.join(root, e))] - - def main(): parser = argparse.ArgumentParser( - description="Step 1 (Effectiveness): Import real code repos (no indexing)" + description="Step 1 (Effectiveness): Import real code repos (with indexing)" ) parser.add_argument( "--source", @@ -76,100 +49,39 @@ def main(): return print("=" * 80) - print("Step 1 (Effectiveness): Import Code Repos (no VLM/embedding)") + print("Step 
1 (Effectiveness): Import Code Repos (with VLM/embedding)") print("=" * 80) print(f" Source: {source}") print(f" Parent: {args.parent}") - print(f" Progress: {PROGRESS_FILE}") - print(" Indexing: DISABLED (build_index=False, summarize=False)") + print(" Indexing: ENABLED (build_index=True, summarize=True)") print() - first_level = scan_first_level_dirs(source) - total = len(first_level) - print(f" First-level directories to import: {total}") - print() - - if total == 0: - print("No subdirectories found. Nothing to import.") - return - - completed = load_progress() - if completed: - already_done = [d for d in first_level if d in completed] - print(f" Resuming: {len(already_done)} directories already imported") - print() - client = SyncOpenViking() client.initialize() - results = [] - for i, name in enumerate(first_level, 1): - if name in completed: - print(f" [{i}/{total}] SKIP (already done): {name}") - continue - - dir_path = os.path.join(source, name) - parent_uri = args.parent - print(f" [{i}/{total}] Importing: {name} ...", end="", flush=True) - - t0 = time.monotonic() - try: - result = client.add_resource( - path=dir_path, - parent=parent_uri, - reason=f"benchmark effectiveness: {name}", - wait=True, - create_parent=True, - build_index=False, - summarize=False, - ) - elapsed = time.monotonic() - t0 - root_uri = result.get("root_uri", "?") - print(f" OK ({elapsed:.1f}s) -> {root_uri}") - save_progress(name) - results.append({"dir": name, "status": "ok", "elapsed_s": round(elapsed, 1)}) - except Exception as e: - elapsed = time.monotonic() - t0 - print(f" FAILED ({elapsed:.1f}s): {e}") - results.append( - { - "dir": name, - "status": "failed", - "elapsed_s": round(elapsed, 1), - "error": str(e)[:500], - } - ) + t0 = time.monotonic() + try: + result = client.add_resource( + path=source, + parent=args.parent, + reason="benchmark effectiveness", + wait=True, + create_parent=True, + build_index=True, + summarize=True, + ) + elapsed = time.monotonic() - t0 + root_uri 
= result.get("root_uri", "?") + print(f"OK ({elapsed:.1f}s) -> {root_uri}") + print() + print("Import completed successfully.") + print("Next step: run step2_quality.py to evaluate retrieval quality") + except Exception as e: + elapsed = time.monotonic() - t0 + print(f"FAILED ({elapsed:.1f}s): {e}") client.close() - print() - print("Summary:") - for r in results: - status = r["status"] - dir_name = r["dir"] - elapsed = r["elapsed_s"] - line = f" {status.upper():>7s} {dir_name} ({elapsed}s)" - if status == "failed": - line += f" -- {r.get('error', '')}" - print(line) - - ok_count = sum(1 for r in results if r["status"] == "ok") - failed_count = sum(1 for r in results if r["status"] == "failed") - skipped_count = sum(1 for d in first_level if d in completed) - total_done = skipped_count + ok_count - - print() - if total_done >= total and failed_count == 0: - print(f"All {total} directories imported successfully (no indexing).") - print("Next step: run step2_reindex.py to build vector indexes") - else: - print( - f" Imported: {ok_count} Failed: {failed_count} " - f"Skipped: {skipped_count} Remaining: {total - total_done}" - ) - if failed_count > 0: - print("Re-run this script to resume from where it left off.") - if __name__ == "__main__": main() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_quality.py similarity index 95% rename from benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py rename to benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_quality.py index 16f74bca7b..c4da2ceb16 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step3_quality.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_quality.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Step 3 (Effectiveness): Evaluate retrieval quality for real code repos. +"""Step 2 (Effectiveness): Evaluate retrieval quality for real code repos. 
Compares grep results (current engine) against ground truth from fs-engine grep. Computes Recall, Precision, F1 per query pattern. @@ -10,12 +10,11 @@ ground truth. Prerequisites: - 1. Run step1_add_resource.py to import repos (no indexing) - 2. Run step2_reindex.py to build vector indexes - 3. First run: set ov.conf grep engine to "fs" and restart server + 1. Run step1_add_resource.py to import repos (with indexing) + 2. First run: set ov.conf grep engine to "fs" and restart server Usage: - python3 step3_quality.py --keywords grep reindex SyncHTTPClient + python3 step2_quality.py --keywords grep reindex SyncHTTPClient """ from __future__ import annotations @@ -176,7 +175,7 @@ def main(): import argparse parser = argparse.ArgumentParser( - description="Step 3 (Effectiveness): Evaluate retrieval quality" + description="Step 2 (Effectiveness): Evaluate retrieval quality" ) parser.add_argument( "--keywords", @@ -195,7 +194,7 @@ def main(): keywords = args.keywords if args.keywords else KEYWORDS if not keywords: print("WARNING: KEYWORDS list is empty. Fill it with real terms before running.") - print(" Use --keywords kw1 kw2 ... or edit step3_quality.py.\n") + print(" Use --keywords kw1 kw2 ... 
or edit step2_quality.py.\n") test_patterns = build_test_patterns(keywords) @@ -297,7 +296,7 @@ def main(): # Save results to local file os.makedirs(RESULT_DIR, exist_ok=True) - output_file = os.path.join(RESULT_DIR, "step3_result.json") + output_file = os.path.join(RESULT_DIR, "step2_result.json") with open(output_file, "w", encoding="utf-8") as f: json.dump( {"uri": uri, "patterns": len(test_patterns), "results": results}, diff --git a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_reindex.py b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_reindex.py deleted file mode 100644 index 775c43da7c..0000000000 --- a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_reindex.py +++ /dev/null @@ -1,231 +0,0 @@ -#!/usr/bin/env python3 -"""Step 2 (Effectiveness): Build vector indexes for imported code repos. - -Submits async reindex tasks for each first-level subdirectory via -SyncHTTPClient.reindex(wait=False), with a concurrency limit of 2 -running tasks. When a task completes, the next one is submitted. -This avoids tree-lock conflicts and prevents resource exhaustion. - -Prerequisites: - 1. Run step1_add_resource.py to import repos (without indexing) - 2. 
Start openviking-server manually - -Usage: - python3 step2_reindex.py -""" - -from __future__ import annotations - -import argparse -import os -import time - -from openviking_cli.client.sync_http import SyncHTTPClient - -DEFAULT_SOURCE = os.path.expanduser("~/.openviking/data/benchmark/OpenViking-main") -PROGRESS_FILE = os.path.expanduser("~/.openviking/data/benchmark/.effectiveness-reindex-progress") -BENCHMARK_PARENT = "viking://resources/benchmark/effectiveness" - -POLL_INTERVAL = 5 # seconds between task status checks -MAX_CONCURRENT = 16 # max running tasks at a time - - -def load_progress() -> set[str]: - if not os.path.exists(PROGRESS_FILE): - return set() - with open(PROGRESS_FILE) as f: - return {line.strip() for line in f if line.strip()} - - -def save_progress(rel_dir: str) -> None: - os.makedirs(os.path.dirname(PROGRESS_FILE), exist_ok=True) - with open(PROGRESS_FILE, "a") as f: - f.write(rel_dir + "\n") - - -def scan_first_level_dirs(root: str) -> list[str]: - """Return sorted list of first-level subdirectory names.""" - try: - entries = sorted(os.listdir(root)) - except OSError: - return [] - return [e for e in entries if not e.startswith(".") and os.path.isdir(os.path.join(root, e))] - - -def main(): - parser = argparse.ArgumentParser( - description="Step 2 (Effectiveness): Build vector indexes via openviking-server" - ) - parser.add_argument( - "--source", - default=DEFAULT_SOURCE, - help=f"Local source directory (must match step1, default: {DEFAULT_SOURCE})", - ) - parser.add_argument( - "--parent", - default=BENCHMARK_PARENT, - help=f"Parent Viking URI (default: {BENCHMARK_PARENT})", - ) - parser.add_argument( - "--concurrency", - type=int, - default=MAX_CONCURRENT, - help=f"Max concurrent reindex tasks (default: {MAX_CONCURRENT})", - ) - args = parser.parse_args() - - source = os.path.expanduser(args.source) - max_concurrent = max(1, args.concurrency) - - print("=" * 80) - print("Step 2 (Effectiveness): Build Vector Indexes (via 
openviking-server)") - print("=" * 80) - print(f" Source: {source}") - print(f" Parent: {args.parent}") - print(f" Progress: {PROGRESS_FILE}") - print(" Mode: vectors_only (wait=False, async)") - print(f" Concurrency: {max_concurrent}") - print() - print(" Prerequisite: openviking-server must be running!") - print() - - # Scan first-level dirs only - first_level = scan_first_level_dirs(source) - total = len(first_level) - print(f" First-level directories to reindex: {total}") - print() - - if total == 0: - print("No subdirectories found. Run step1_add_resource.py first.") - return - - completed = load_progress() - if completed: - already_done = [d for d in first_level if d in completed] - print(f" Resuming: {len(already_done)} directories already reindexed") - print() - - client = SyncHTTPClient(account="default", user="default") - client.initialize() - - # Build work queue (skip already completed) - work_queue: list[str] = [name for name in first_level if name not in completed] - skipped_count = len(first_level) - len(work_queue) - - # running: task_id -> (name, submit_time) - running: dict[str, tuple[str, float]] = {} - results: list[dict] = [] - - def _submit_next() -> bool: - """Submit the next item from work_queue if slot available. 
Returns True if submitted.""" - if not work_queue or len(running) >= max_concurrent: - return False - name = work_queue.pop(0) - dir_uri = f"{args.parent}/{name}" - idx = total - len(work_queue) - print(f" [{idx}/{total}] Submitting: {name} ...", end="", flush=True) - try: - result = client.reindex(uri=dir_uri, mode="vectors_only", wait=False) - task_id = result.get("task_id", "") - if task_id: - print(f" task_id={task_id[:8]}...") - running[task_id] = (name, time.monotonic()) - else: - print(" completed synchronously") - save_progress(name) - results.append({"dir": name, "status": "ok", "elapsed_s": 0.0}) - return True - except Exception as e: - print(f" FAILED: {e}") - results.append({"dir": name, "status": "failed", "error": str(e)[:500]}) - return True - - # Fill initial slots - while len(running) < max_concurrent and work_queue: - _submit_next() - - if not running and not results: - client.close() - _print_summary(results, skipped_count, first_level) - return - - # Poll loop: check running tasks, submit new ones as slots free up - print() - print(f" Running {len(running)} tasks, {len(work_queue)} queued") - print() - - while running: - done_ids = [] - for task_id, (name, submit_time) in list(running.items()): - try: - task_info = client.get_task(task_id) - except Exception: - continue - if task_info is None: - continue - status = task_info.get("status", "") - if status in ("completed", "failed"): - elapsed = time.monotonic() - submit_time - if status == "completed": - print(f" DONE {name} ({elapsed:.1f}s)") - save_progress(name) - results.append({"dir": name, "status": "ok", "elapsed_s": round(elapsed, 1)}) - else: - error = task_info.get("error", "unknown error") - print(f" FAIL {name} ({elapsed:.1f}s): {error}") - results.append( - { - "dir": name, - "status": "failed", - "elapsed_s": round(elapsed, 1), - "error": error, - } - ) - done_ids.append(task_id) - - for tid in done_ids: - del running[tid] - - # Fill freed slots - while len(running) < max_concurrent 
and work_queue: - _submit_next() - - if running: - time.sleep(POLL_INTERVAL) - - client.close() - _print_summary(results, skipped_count, first_level) - - -def _print_summary(results: list[dict], skipped_count: int, all_dirs: list[str]) -> None: - print() - print("Summary:") - ok_count = sum(1 for r in results if r.get("status") == "ok") - failed_count = sum(1 for r in results if r.get("status") == "failed") - total_done = skipped_count + ok_count - - for r in results: - status = r.get("status", "unknown") - line = f" {status.upper():>7s} {r.get('dir', '?')}" - if "elapsed_s" in r: - line += f" ({r['elapsed_s']}s)" - if status == "failed": - line += f" -- {r.get('error', '')}" - print(line) - - print() - total = len(all_dirs) - if total_done >= total and failed_count == 0: - print(f"All {total} directories reindexed successfully.") - print("Next step: run step3_quality.py to evaluate retrieval quality") - else: - print( - f" Reindexed: {ok_count} Failed: {failed_count} " - f"Skipped: {skipped_count} Remaining: {total - total_done}" - ) - if failed_count > 0: - print("Re-run this script to resume from where it left off.") - - -if __name__ == "__main__": - main() From 0c8eef0d31738b5567807132588a6858d342543b Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Thu, 11 Jun 2026 12:12:30 +0800 Subject: [PATCH 21/31] optimize: ensure raw data for content field --- .../storage/viking_vector_index_backend.py | 6 + openviking/utils/embedding_utils.py | 85 ++++++++++----- tests/storage/test_collection_schemas.py | 47 ++++++++ .../test_embedding_msg_converter_tenant.py | 14 ++- tests/unit/test_vectorize_file_strategy.py | 103 ++++++++++++++++++ 5 files changed, 226 insertions(+), 29 deletions(-) diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index 1310e50ac7..54e4aa7d9b 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -74,6 +74,8 @@ 
"account_id", ] +VIKINGDB_CONTENT_MAX_SIZE = 64 * 1024 + class _AsyncVectorAdapter: """Thread-offloaded facade for sync vector adapters.""" @@ -171,6 +173,10 @@ def _prepare_upsert_payload(self, data: Dict[str, Any]) -> Dict[str, Any]: except Exception: pass + content = result.get("content") + if isinstance(content, (str, bytes)): + result["content"] = content[:VIKINGDB_CONTENT_MAX_SIZE] + return result async def _refresh_meta_data_async(self) -> None: diff --git a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py index 3f416a0012..52b114fecd 100644 --- a/openviking/utils/embedding_utils.py +++ b/openviking/utils/embedding_utils.py @@ -11,6 +11,8 @@ from datetime import datetime, timezone from typing import Any, Dict, Optional +from charset_normalizer import from_bytes + from openviking.core.context import Context, ContextLevel, ResourceContentType, Vectorize from openviking.core.namespace import context_type_for_uri, owner_space_for_uri from openviking.server.identity import RequestContext @@ -231,6 +233,41 @@ async def _build_image_data_uri( return None +def _coerce_text_file_content(raw: Any) -> str: + """Coerce known text-file content returned by VikingFS into str.""" + if isinstance(raw, bytes): + return raw.decode("utf-8", errors="replace") + return raw or "" + + +def _decode_unknown_file_bytes(raw: bytes) -> str: + """Decode unknown file bytes with charset sniffing for BM25 content.""" + if not raw: + return "" + best = from_bytes(raw).best() + if best is not None: + return str(best) + return raw.decode("utf-8", errors="replace") + + +async def _read_unknown_file_text_for_fulltext( + file_path: str, + viking_fs, + ctx: Optional[RequestContext], +) -> str: + """Best-effort raw file text for BM25/full-text indexing. + + This text is intentionally separate from the embedding input. Unknown file + types may still embed their generated summary, while grep/BM25 should index + the original text when the file can be read as text-like content. 
+ """ + try: + return _decode_unknown_file_bytes(await viking_fs.read_file_bytes(file_path, ctx=ctx)) + except Exception as e: + logger.debug(f"Failed to read full-text content for {file_path}: {e}") + return "" + + async def vectorize_directory_meta( uri: str, abstract: str, @@ -398,42 +435,34 @@ async def vectorize_file( logger.warning( f"Unsupported file type for {file_path}, falling back to summary for vectorization" ) - context.set_vectorize(Vectorize(text=summary, full_text=summary)) + full_content = await _read_unknown_file_text_for_fulltext(file_path, viking_fs, ctx) + context.set_vectorize(Vectorize(text=summary, full_text=full_content or summary)) else: logger.warning( f"Unsupported file type for {file_path} and no summary available, skipping vectorization" ) return elif content_type == ResourceContentType.TEXT: - if summary and effective_text_source in {"summary_first", "summary_only"}: - # Use summary for vectorization, but store full file content for BM25. - full_content = "" - try: - raw = await viking_fs.read_file(file_path, ctx=ctx) - if isinstance(raw, bytes): - raw = raw.decode("utf-8", errors="replace") - full_content = raw - except Exception: - pass - context.set_vectorize(Vectorize(text=summary, full_text=full_content or summary)) + # Known text files use VikingFS' text read path once, then reuse that + # content for BM25 regardless of whether embedding uses summary or raw text. + try: + content = _coerce_text_file_content(await viking_fs.read_file(file_path, ctx=ctx)) + except Exception as e: + logger.warning( + f"Failed to read file content for {file_path}, falling back to summary: {e}" + ) + if summary: + context.set_vectorize(Vectorize(text=summary, full_text=summary)) + else: + logger.warning(f"No summary available for {file_path}, skipping vectorization") + return else: - # Read raw file content; embedders apply their own input guard. 
- try: - content = await viking_fs.read_file(file_path, ctx=ctx) - if isinstance(content, bytes): - content = content.decode("utf-8", errors="replace") + if summary and effective_text_source in {"summary_first", "summary_only"}: + # Use summary for vectorization, but reuse the single raw text read for BM25. + context.set_vectorize(Vectorize(text=summary, full_text=content or summary)) + else: + # Embedders apply their own input guard. context.set_vectorize(Vectorize(text=content, full_text=content)) - except Exception as e: - logger.warning( - f"Failed to read file content for {file_path}, falling back to summary: {e}" - ) - if summary: - context.set_vectorize(Vectorize(text=summary, full_text=summary)) - else: - logger.warning( - f"No summary available for {file_path}, skipping vectorization" - ) - return elif content_type == ResourceContentType.IMAGE and image_vectorization in { "image_only", "image_and_summary", diff --git a/tests/storage/test_collection_schemas.py b/tests/storage/test_collection_schemas.py index bb9c40e680..38d23e4ea9 100644 --- a/tests/storage/test_collection_schemas.py +++ b/tests/storage/test_collection_schemas.py @@ -757,6 +757,53 @@ def upsert(self, data): } +@pytest.mark.asyncio +async def test_single_account_backend_truncates_content_only_at_vector_write(): + captured = {} + full_content = "x" * (64 * 1024 + 17) + + class _Collection: + def get_meta_data(self): + return { + "Fields": [ + {"FieldName": "id"}, + {"FieldName": "uri"}, + {"FieldName": "abstract"}, + {"FieldName": "content", "FieldType": "text"}, + {"FieldName": "account_id"}, + ] + } + + class _Adapter: + mode = "local" + + def get_collection(self): + return _Collection() + + def upsert(self, data): + captured["data"] = dict(data) + return [data["id"]] + + backend = _SingleAccountBackend( + config=VectorDBBackendConfig(backend="local", name="context", dimension=2), + bound_account_id="acc1", + shared_adapter=_Adapter(), + ) + source_data = { + "id": "rec-large", + "uri": 
"viking://resources/large.txt", + "abstract": "sample", + "content": full_content, + "account_id": "acc1", + } + + record_id = await backend.upsert(source_data) + + assert record_id == "rec-large" + assert source_data["content"] == full_content + assert captured["data"]["content"] == full_content[: 64 * 1024] + + @pytest.mark.asyncio async def test_single_account_backend_collection_exists_runs_in_threadpool(monkeypatch): called = {} diff --git a/tests/storage/test_embedding_msg_converter_tenant.py b/tests/storage/test_embedding_msg_converter_tenant.py index 558ddc68d9..61759548bb 100644 --- a/tests/storage/test_embedding_msg_converter_tenant.py +++ b/tests/storage/test_embedding_msg_converter_tenant.py @@ -5,7 +5,7 @@ import pytest -from openviking.core.context import Context +from openviking.core.context import Context, Vectorize from openviking.storage.queuefs.embedding_msg_converter import EmbeddingMsgConverter from openviking_cli.session.user_id import UserIdentifier @@ -45,3 +45,15 @@ def test_embedding_msg_converter_backfills_account_and_owner_fields( expected_owner_user_id(user) if callable(expected_owner_user_id) else expected_owner_user_id ) assert msg.context_data["owner_user_id"] == expected_user + + +def test_embedding_msg_converter_preserves_full_content_without_vikingdb_truncation(): + full_content = "x" * (64 * 1024 + 17) + context = Context(uri="viking://resources/large.txt", abstract="short embedding text") + context.set_vectorize(Vectorize(text="short embedding text", full_text=full_content)) + + msg = EmbeddingMsgConverter.from_context(context) + + assert msg is not None + assert msg.message == "short embedding text" + assert msg.context_data["content"] == full_content diff --git a/tests/unit/test_vectorize_file_strategy.py b/tests/unit/test_vectorize_file_strategy.py index 5389379636..885f139776 100644 --- a/tests/unit/test_vectorize_file_strategy.py +++ b/tests/unit/test_vectorize_file_strategy.py @@ -27,10 +27,19 @@ def get_queue(self, _name): 
class DummyFS: def __init__(self, content): self.content = content + self.read_file_calls = 0 + self.read_file_bytes_calls = 0 async def read_file(self, _path, ctx=None): + self.read_file_calls += 1 return self.content + async def read_file_bytes(self, _path, ctx=None): + self.read_file_bytes_calls += 1 + if isinstance(self.content, bytes): + return self.content + return str(self.content).encode("utf-8") + async def exists(self, _path, ctx=None): return False @@ -45,6 +54,9 @@ class DummyUser: def user_space_name(self): return "default" + def to_dict(self): + return {"account_id": self.account_id, "user_id": self.user_id} + class DummyReq: def __init__(self): @@ -82,6 +94,97 @@ async def test_vectorize_file_uses_summary_first(monkeypatch): assert queue.items[0].get_vectorization_text() == "short summary" +@pytest.mark.asyncio +async def test_vectorize_unknown_text_file_embeds_summary_but_indexes_raw_content(monkeypatch): + queue = DummyQueue() + raw_makefile = "build:\n\tcargo build --locked\n" + monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue)) + monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: DummyFS(raw_makefile)) + monkeypatch.setattr( + embedding_utils, + "get_openviking_config", + lambda: types.SimpleNamespace( + embedding=types.SimpleNamespace(text_source="summary_first", max_input_tokens=1000) + ), + ) + + await embedding_utils.vectorize_file( + file_path="viking://user/default/resources/Makefile", + summary_dict={"name": "Makefile", "summary": "VLM generated build file summary"}, + parent_uri="viking://user/default/resources", + ctx=DummyReq(), + ) + + assert len(queue.items) == 1 + msg = queue.items[0] + assert msg.message == "VLM generated build file summary" + assert msg.context_data["content"] == raw_makefile + + +@pytest.mark.asyncio +async def test_vectorize_unknown_text_file_sniffs_non_utf8_raw_content(monkeypatch): + queue = DummyQueue() + raw_content = ( + "# 构建脚本\n" + "目标: 编译项目\n" + "说明: 
这是一个中文 Makefile 内容,用于测试编码探测。\n" + "命令: cargo build --locked\n" + ) + fs = DummyFS(raw_content.encode("gb18030")) + monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue)) + monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: fs) + monkeypatch.setattr( + embedding_utils, + "get_openviking_config", + lambda: types.SimpleNamespace( + embedding=types.SimpleNamespace(text_source="summary_first", max_input_tokens=1000) + ), + ) + + await embedding_utils.vectorize_file( + file_path="viking://user/default/resources/Makefile", + summary_dict={"name": "Makefile", "summary": "VLM generated build file summary"}, + parent_uri="viking://user/default/resources", + ctx=DummyReq(), + ) + + assert len(queue.items) == 1 + msg = queue.items[0] + assert msg.message == "VLM generated build file summary" + assert msg.context_data["content"] == raw_content + assert fs.read_file_bytes_calls == 1 + assert fs.read_file_calls == 0 + + +@pytest.mark.asyncio +async def test_vectorize_text_summary_first_reuses_single_file_read(monkeypatch): + queue = DummyQueue() + fs = DummyFS("# README\nraw text for bm25\n") + monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue)) + monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: fs) + monkeypatch.setattr( + embedding_utils, + "get_openviking_config", + lambda: types.SimpleNamespace( + embedding=types.SimpleNamespace(text_source="summary_first", max_input_tokens=1000) + ), + ) + + await embedding_utils.vectorize_file( + file_path="viking://user/default/resources/README.md", + summary_dict={"name": "README.md", "summary": "summary for embedding"}, + parent_uri="viking://user/default/resources", + ctx=DummyReq(), + ) + + assert len(queue.items) == 1 + msg = queue.items[0] + assert msg.message == "summary for embedding" + assert msg.context_data["content"] == "# README\nraw text for bm25\n" + assert fs.read_file_calls == 1 + assert fs.read_file_bytes_calls == 0 + + 
@pytest.mark.asyncio async def test_vectorize_file_preserves_content_until_embedder_input_guard(monkeypatch): queue = DummyQueue() From 4b8481ad9568fd9e677889acff9451c56bd3a64a Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Thu, 11 Jun 2026 16:22:27 +0800 Subject: [PATCH 22/31] optimize: fulltext analyzer's stop-words only use symbols --- openviking/storage/collection_schemas.py | 5 ++++- tests/storage/test_rebuild_schema.py | 9 ++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/openviking/storage/collection_schemas.py b/openviking/storage/collection_schemas.py index 57475a6db2..e35b464997 100644 --- a/openviking/storage/collection_schemas.py +++ b/openviking/storage/collection_schemas.py @@ -156,7 +156,10 @@ def context_collection( "Fields": fields, "ScalarIndex": scalar_index, "FullText": [ - {"Field": "content", "Analyzer": {"Tokenizer": "standard"}}, + { + "Field": "content", + "Analyzer": {"Tokenizer": "standard", "StopWordsFilters": ["symbol"]}, + }, ], } diff --git a/tests/storage/test_rebuild_schema.py b/tests/storage/test_rebuild_schema.py index d170507057..aee7a157cf 100644 --- a/tests/storage/test_rebuild_schema.py +++ b/tests/storage/test_rebuild_schema.py @@ -11,5 +11,12 @@ def test_context_collection_contains_content_field_for_fulltext(): # embedding_content is not a schema field assert "embedding_content" not in field_names # FullText config must reference the content field - fulltext_fields = [ft["Field"] for ft in schema.get("FullText", [])] + fulltext_cfg = schema.get("FullText", []) + fulltext_fields = [ft["Field"] for ft in fulltext_cfg] assert "content" in fulltext_fields + + # Analyzer config must include tokenizer + stopwords filter + content_cfg = next(ft for ft in fulltext_cfg if ft.get("Field") == "content") + analyzer = content_cfg.get("Analyzer") or {} + assert analyzer.get("Tokenizer") == "standard" + assert analyzer.get("StopWordsFilters") == ["symbol"] From 3ae5dbff2ed53d078502e148a17c4ffe01aeed41 Mon Sep 17 
00:00:00 2001 From: "liuyang.max" Date: Fri, 12 Jun 2026 14:24:31 +0800 Subject: [PATCH 23/31] fix: adapt to new ov cli for benchmark --- .../retrieval/grep/vikingdb_bm25/effectiveness/step2_quality.py | 2 +- .../retrieval/grep/vikingdb_bm25/performance/step2_reindex.py | 2 +- .../retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_quality.py b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_quality.py index c4da2ceb16..55237517ca 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_quality.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/effectiveness/step2_quality.py @@ -208,7 +208,7 @@ def main(): print(" Subsequent runs can use any engine; cached ground truth is reused.") print() - client = SyncHTTPClient(account="default", user="default") + client = SyncHTTPClient() client.initialize() # Phase 1: Compute ground truth (needs fs engine on first run) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step2_reindex.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step2_reindex.py index 1caff8161f..41df30d3be 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/performance/step2_reindex.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step2_reindex.py @@ -105,7 +105,7 @@ def main(): print(f" Resuming: {len(already_done)} directories already reindexed") print() - client = SyncHTTPClient(account="default", user="default") + client = SyncHTTPClient() client.initialize() # Build work queue (skip already completed) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py index ec06b0657b..db98c088a3 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py @@ -338,7 +338,7 @@ def 
main(): total_files = count_local_files() - client = SyncHTTPClient(account="default", user="default") + client = SyncHTTPClient() client.initialize() print("Computing ground truth (OV grep, fs engine)...") From 211d04bc48ff0db97e2ab29914286ec71321bcd9 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Fri, 12 Jun 2026 15:34:37 +0800 Subject: [PATCH 24/31] optimize: reuse file content to avoid re-read AGFS file --- .../storage/queuefs/semantic_processor.py | 31 ++-- openviking/utils/embedding_utils.py | 62 ++++++- .../test_semantic_processor_language.py | 76 ++++---- tests/unit/test_vectorize_file_strategy.py | 169 ++++++++++++++++++ 4 files changed, 285 insertions(+), 53 deletions(-) diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index fa3a20e993..7e43578d88 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -1005,7 +1005,7 @@ async def _generate_text_summary( file_name: str, llm_sem: asyncio.Semaphore, ctx: Optional[RequestContext] = None, - ) -> Dict[str, str]: + ) -> Dict[str, Any]: """Generate summary for a single text file (code, documentation, or other text).""" viking_fs = get_viking_fs() vlm = get_openviking_config().vlm @@ -1013,12 +1013,14 @@ async def _generate_text_summary( content = await viking_fs.read_file(file_path, ctx=active_ctx) if isinstance(content, bytes): - # Try to decode with error handling for text files - try: - content = content.decode("utf-8") - except UnicodeDecodeError: - logger.warning(f"Failed to decode file as UTF-8, skipping: {file_path}") - return {"name": file_name, "summary": ""} + from openviking.utils.embedding_utils import _decode_text_bytes + + content = _decode_text_bytes(content) + + full_content = content or "" + + def result(summary: str) -> Dict[str, Any]: + return {"name": file_name, "summary": summary, "content": full_content} # Limit content length max_chars = 
get_openviking_config().semantic.max_file_content_chars @@ -1028,7 +1030,7 @@ async def _generate_text_summary( # Generate summary if not vlm.is_available(): logger.warning("VLM not available, using empty summary") - return {"name": file_name, "summary": ""} + return result("") from openviking.session.memory.utils.language import resolve_output_language @@ -1050,7 +1052,7 @@ async def _generate_text_summary( if len(skeleton_text) > max_skeleton_chars: skeleton_text = skeleton_text[:max_skeleton_chars] if code_mode == "ast": - return {"name": file_name, "summary": skeleton_text} + return result(skeleton_text) else: # ast_llm prompt = render_prompt( "semantic.code_ast_summary", @@ -1063,7 +1065,7 @@ async def _generate_text_summary( async with llm_sem: with bind_telemetry_stage("resource_summarize"): summary = await vlm.get_completion_async(prompt) - return {"name": file_name, "summary": summary.strip()} + return result(summary.strip()) if skeleton_text is None: logger.info("AST unsupported language, fallback to LLM: %s", file_path) else: @@ -1077,7 +1079,7 @@ async def _generate_text_summary( async with llm_sem: with bind_telemetry_stage("resource_summarize"): summary = await vlm.get_completion_async(prompt) - return {"name": file_name, "summary": summary.strip()} + return result(summary.strip()) elif file_type == FILE_TYPE_DOCUMENTATION: prompt_id = "semantic.document_summary" @@ -1092,21 +1094,22 @@ async def _generate_text_summary( async with llm_sem: with bind_telemetry_stage("resource_summarize"): summary = await vlm.get_completion_async(prompt) - return {"name": file_name, "summary": summary.strip()} + return result(summary.strip()) async def _generate_single_file_summary( self, file_path: str, llm_sem: Optional[asyncio.Semaphore] = None, ctx: Optional[RequestContext] = None, - ) -> Dict[str, str]: + ) -> Dict[str, Any]: """Generate summary for a single file. 
Args: file_path: File path Returns: - {"name": file_name, "summary": summary_content} + {"name": file_name, "summary": summary_content}; text files also carry + decoded "content" so vectorization can avoid re-reading the same file. """ file_name = file_path.split("/")[-1] llm_sem = llm_sem or asyncio.Semaphore(self.max_concurrent_llm) diff --git a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py index 2aaefb4ebb..f21b998639 100644 --- a/openviking/utils/embedding_utils.py +++ b/openviking/utils/embedding_utils.py @@ -236,18 +236,50 @@ async def _build_image_data_uri( def _coerce_text_file_content(raw: Any) -> str: """Coerce known text-file content returned by VikingFS into str.""" if isinstance(raw, bytes): - return raw.decode("utf-8", errors="replace") + return _decode_text_bytes(raw) return raw or "" -def _decode_unknown_file_bytes(raw: bytes) -> str: - """Decode unknown file bytes with charset sniffing for BM25 content.""" +def _looks_like_binary_bytes(raw: bytes) -> bool: + """Conservative binary check for unknown file bytes.""" + if not raw: + return False + if b"\x00" in raw[:4096]: + return True + + allowed_controls = {9, 10, 12, 13} + sample = raw[:4096] + control_count = sum(byte < 32 and byte not in allowed_controls for byte in sample) + return control_count / len(sample) > 0.3 + + +def _decode_text_bytes(raw: bytes) -> str: + """Decode file bytes for BM25 content. + + Prefer UTF-8. If UTF-8 fails, reject binary-looking bytes, then try charset + sniffing. Return an empty string when no text encoding can be recognized. 
+ """ if not raw: return "" + + try: + return raw.decode("utf-8") + except UnicodeDecodeError: + pass + + if _looks_like_binary_bytes(raw): + return "" + best = from_bytes(raw).best() - if best is not None: - return str(best) - return raw.decode("utf-8", errors="replace") + if best is None: + return "" + + return str(best) + + +def _decode_unknown_file_bytes(raw: bytes) -> str: + """Decode unknown file bytes with the shared text-byte decoding strategy.""" + return _decode_text_bytes(raw) async def _read_unknown_file_text_for_fulltext( @@ -374,7 +406,7 @@ async def vectorize_directory_meta( async def vectorize_file( file_path: str, - summary_dict: Dict[str, str], + summary_dict: Dict[str, Any], parent_uri: str, context_type: str = "resource", ctx: Optional[RequestContext] = None, @@ -403,6 +435,10 @@ async def vectorize_file( file_name = summary_dict.get("name") or os.path.basename(file_path) summary = summary_dict.get("summary", "") + has_reusable_content = "content" in summary_dict + reusable_content = ( + _coerce_text_file_content(summary_dict.get("content")) if has_reusable_content else "" + ) created_at, updated_at = await _resolve_context_timestamps( file_path, @@ -435,7 +471,11 @@ async def vectorize_file( logger.warning( f"Unsupported file type for {file_path}, falling back to summary for vectorization" ) - full_content = await _read_unknown_file_text_for_fulltext(file_path, viking_fs, ctx) + full_content = ( + reusable_content + if has_reusable_content + else await _read_unknown_file_text_for_fulltext(file_path, viking_fs, ctx) + ) context.set_vectorize(Vectorize(text=summary, full_text=full_content or summary)) else: logger.warning( @@ -446,7 +486,11 @@ async def vectorize_file( # Known text files use VikingFS' text read path once, then reuse that # content for BM25 regardless of whether embedding uses summary or raw text. 
try: - content = _coerce_text_file_content(await viking_fs.read_file(file_path, ctx=ctx)) + content = ( + reusable_content + if has_reusable_content + else _coerce_text_file_content(await viking_fs.read_file(file_path, ctx=ctx)) + ) except Exception as e: logger.warning( f"Failed to read file content for {file_path}, falling back to summary: {e}" diff --git a/tests/storage/test_semantic_processor_language.py b/tests/storage/test_semantic_processor_language.py index fd3b00f918..d06909f973 100644 --- a/tests/storage/test_semantic_processor_language.py +++ b/tests/storage/test_semantic_processor_language.py @@ -123,10 +123,7 @@ def test_detect_language_english_with_single_korean_char(self): assert language == "en" def test_detect_language_italian(self): - text = ( - "Questo documento descrive le preferenze dell utente " - "e il progetto da completare." - ) + text = "Questo documento descrive le preferenze dell utente e il progetto da completare." language = _detect_language_from_text(text, fallback_language="it") assert language == "it" @@ -143,9 +140,15 @@ def test_strong_italian_text_can_override_system_fallback(self): [ ("project document user data model profile", "en"), ("Ce document décrit les préférences de l utilisateur et le projet à terminer.", "fr"), - ("Este documento describe las preferencias del usuario y el proyecto para completar.", "es"), + ( + "Este documento describe las preferencias del usuario y el proyecto para completar.", + "es", + ), ("Dieses Dokument beschreibt die Präferenzen der Benutzer und das Projekt.", "de"), - ("Este documento descreve as preferências do usuário e o projeto para completar.", "pt"), + ( + "Este documento descreve as preferências do usuário e o projeto para completar.", + "pt", + ), ], ) def test_detect_latin_language_conservatively(self, text, expected): @@ -357,15 +360,19 @@ async def test_e2e_code_output_language( mock_viking_fs = self._create_mock_viking_fs(content) mock_config = self._create_mock_config(mock_vlm) - 
with patch.dict( - os.environ, - {"LC_ALL": self._LANGUAGE_LOCALE[expected_lang]}, - ), patch( - "openviking.storage.queuefs.semantic_processor.get_viking_fs", - return_value=mock_viking_fs, - ), patch( - "openviking.storage.queuefs.semantic_processor.get_openviking_config", - return_value=mock_config, + with ( + patch.dict( + os.environ, + {"LC_ALL": self._LANGUAGE_LOCALE[expected_lang]}, + ), + patch( + "openviking.storage.queuefs.semantic_processor.get_viking_fs", + return_value=mock_viking_fs, + ), + patch( + "openviking.storage.queuefs.semantic_processor.get_openviking_config", + return_value=mock_config, + ), ): processor = SemanticProcessor() processor._current_ctx = MagicMock() @@ -384,6 +391,7 @@ async def test_e2e_code_output_language( assert _verify_content_language(result["summary"], expected_lang), ( f"{file_name}: Content language mismatch. Expected {expected_lang}, got: {result['summary']}" ) + assert result["content"] == content @pytest.mark.asyncio @pytest.mark.parametrize( @@ -401,15 +409,19 @@ async def test_e2e_russian_arabic_output_language(self, content, file_name, expe mock_viking_fs = self._create_mock_viking_fs(content) mock_config = self._create_mock_config(mock_vlm) - with patch.dict( - os.environ, - {"LC_ALL": self._LANGUAGE_LOCALE[expected_lang]}, - ), patch( - "openviking.storage.queuefs.semantic_processor.get_viking_fs", - return_value=mock_viking_fs, - ), patch( - "openviking.storage.queuefs.semantic_processor.get_openviking_config", - return_value=mock_config, + with ( + patch.dict( + os.environ, + {"LC_ALL": self._LANGUAGE_LOCALE[expected_lang]}, + ), + patch( + "openviking.storage.queuefs.semantic_processor.get_viking_fs", + return_value=mock_viking_fs, + ), + patch( + "openviking.storage.queuefs.semantic_processor.get_openviking_config", + return_value=mock_config, + ), ): processor = SemanticProcessor() processor._current_ctx = MagicMock() @@ -517,12 +529,16 @@ def test_non_english_locale_hint_wins_over_timezone(self): assert 
_resolve_system_fallback_language("en") == "ja" def test_local_timezone_hint_used_when_tz_env_absent(self): - with patch.dict(os.environ, {}, clear=True), patch( - "openviking.session.memory.utils.language.locale.getlocale", - return_value=("C", "UTF-8"), - ), patch( - "openviking.session.memory.utils.language.os.path.realpath", - return_value="/usr/share/zoneinfo.default/Asia/Shanghai", + with ( + patch.dict(os.environ, {}, clear=True), + patch( + "openviking.session.memory.utils.language.locale.getlocale", + return_value=("C", "UTF-8"), + ), + patch( + "openviking.session.memory.utils.language.os.path.realpath", + return_value="/usr/share/zoneinfo.default/Asia/Shanghai", + ), ): assert _resolve_system_fallback_language("en") == "zh-CN" diff --git a/tests/unit/test_vectorize_file_strategy.py b/tests/unit/test_vectorize_file_strategy.py index 885f139776..4613b6e202 100644 --- a/tests/unit/test_vectorize_file_strategy.py +++ b/tests/unit/test_vectorize_file_strategy.py @@ -156,6 +156,106 @@ async def test_vectorize_unknown_text_file_sniffs_non_utf8_raw_content(monkeypat assert fs.read_file_calls == 0 +@pytest.mark.asyncio +async def test_vectorize_unknown_file_reuses_summary_content_without_reread(monkeypatch): + queue = DummyQueue() + raw_content = "build:\n\tcargo build --locked\n" + fs = DummyFS("should not be read") + monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue)) + monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: fs) + monkeypatch.setattr( + embedding_utils, + "get_openviking_config", + lambda: types.SimpleNamespace( + embedding=types.SimpleNamespace(text_source="summary_first", max_input_tokens=1000) + ), + ) + + await embedding_utils.vectorize_file( + file_path="viking://user/default/resources/Makefile", + summary_dict={ + "name": "Makefile", + "summary": "VLM generated build file summary", + "content": raw_content, + }, + parent_uri="viking://user/default/resources", + ctx=DummyReq(), + ) + + assert 
len(queue.items) == 1 + msg = queue.items[0] + assert msg.message == "VLM generated build file summary" + assert msg.context_data["content"] == raw_content + assert fs.read_file_bytes_calls == 0 + assert fs.read_file_calls == 0 + + +@pytest.mark.asyncio +async def test_vectorize_unknown_binary_file_falls_back_to_summary(monkeypatch): + queue = DummyQueue() + summary = "VLM generated binary file summary" + binary_content = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01" + fs = DummyFS(binary_content) + monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue)) + monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: fs) + monkeypatch.setattr( + embedding_utils, + "get_openviking_config", + lambda: types.SimpleNamespace( + embedding=types.SimpleNamespace(text_source="summary_first", max_input_tokens=1000) + ), + ) + + await embedding_utils.vectorize_file( + file_path="viking://user/default/resources/model.weights", + summary_dict={"name": "model.weights", "summary": summary}, + parent_uri="viking://user/default/resources", + ctx=DummyReq(), + ) + + assert len(queue.items) == 1 + msg = queue.items[0] + assert msg.message == summary + assert msg.context_data["content"] == summary + assert fs.read_file_bytes_calls == 1 + assert fs.read_file_calls == 0 + + +@pytest.mark.asyncio +async def test_vectorize_unknown_unrecognizable_encoding_falls_back_to_summary(monkeypatch): + queue = DummyQueue() + summary = "VLM generated unknown file summary" + fs = DummyFS(b"\xff\xfe\xfd") + monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue)) + monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: fs) + monkeypatch.setattr( + embedding_utils, + "get_openviking_config", + lambda: types.SimpleNamespace( + embedding=types.SimpleNamespace(text_source="summary_first", max_input_tokens=1000) + ), + ) + monkeypatch.setattr( + embedding_utils, + "from_bytes", + lambda _raw: 
types.SimpleNamespace(best=lambda: None), + ) + + await embedding_utils.vectorize_file( + file_path="viking://user/default/resources/unknown.data", + summary_dict={"name": "unknown.data", "summary": summary}, + parent_uri="viking://user/default/resources", + ctx=DummyReq(), + ) + + assert len(queue.items) == 1 + msg = queue.items[0] + assert msg.message == summary + assert msg.context_data["content"] == summary + assert fs.read_file_bytes_calls == 1 + assert fs.read_file_calls == 0 + + @pytest.mark.asyncio async def test_vectorize_text_summary_first_reuses_single_file_read(monkeypatch): queue = DummyQueue() @@ -185,6 +285,75 @@ async def test_vectorize_text_summary_first_reuses_single_file_read(monkeypatch) assert fs.read_file_bytes_calls == 0 +@pytest.mark.asyncio +async def test_vectorize_text_file_reuses_summary_content_without_reread(monkeypatch): + queue = DummyQueue() + raw_content = "# README\nraw text already read during summary\n" + fs = DummyFS("should not be read") + monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue)) + monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: fs) + monkeypatch.setattr( + embedding_utils, + "get_openviking_config", + lambda: types.SimpleNamespace( + embedding=types.SimpleNamespace(text_source="summary_first", max_input_tokens=1000) + ), + ) + + await embedding_utils.vectorize_file( + file_path="viking://user/default/resources/README.md", + summary_dict={ + "name": "README.md", + "summary": "summary for embedding", + "content": raw_content, + }, + parent_uri="viking://user/default/resources", + ctx=DummyReq(), + ) + + assert len(queue.items) == 1 + msg = queue.items[0] + assert msg.message == "summary for embedding" + assert msg.context_data["content"] == raw_content + assert fs.read_file_calls == 0 + assert fs.read_file_bytes_calls == 0 + + +@pytest.mark.asyncio +async def test_vectorize_text_bytes_sniffs_non_utf8_content(monkeypatch): + queue = DummyQueue() + raw_content = ( + 
"# 说明文档\n" + "目标: 验证已知 TEXT 文件的 bytes 内容也会进行编码探测。\n" + "说明: 这是一个中文 README 内容,用于测试 GB18030 编码识别。\n" + "命令: openviking benchmark run\n" + ) + fs = DummyFS(raw_content.encode("gb18030")) + monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue)) + monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: fs) + monkeypatch.setattr( + embedding_utils, + "get_openviking_config", + lambda: types.SimpleNamespace( + embedding=types.SimpleNamespace(text_source="summary_first", max_input_tokens=1000) + ), + ) + + await embedding_utils.vectorize_file( + file_path="viking://user/default/resources/README.md", + summary_dict={"name": "README.md", "summary": "summary for embedding"}, + parent_uri="viking://user/default/resources", + ctx=DummyReq(), + ) + + assert len(queue.items) == 1 + msg = queue.items[0] + assert msg.message == "summary for embedding" + assert msg.context_data["content"] == raw_content + assert fs.read_file_calls == 1 + assert fs.read_file_bytes_calls == 0 + + @pytest.mark.asyncio async def test_vectorize_file_preserves_content_until_embedder_input_guard(monkeypatch): queue = DummyQueue() From b1322490ea7a3a7b044a8d436424237d3bd52416 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Mon, 15 Jun 2026 16:43:15 +0800 Subject: [PATCH 25/31] optimize: tune grep vikingdb defaults and refresh bm25 benchmark scripts --- .../retrieval/grep/vikingdb_bm25/README.md | 29 +-- .../retrieval/grep/vikingdb_bm25/README_CN.md | 30 +-- .../performance/step0_prepare_data.py | 70 ++++--- .../performance/step3_benchmark.py | 195 +++--------------- docs/en/guides/01-configuration.md | 6 +- docs/zh/guides/01-configuration.md | 6 +- openviking/storage/viking_fs.py | 9 +- openviking_cli/utils/config/grep_config.py | 2 +- tests/storage/test_viking_fs_grep.py | 45 ++++ 9 files changed, 164 insertions(+), 228 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README.md b/benchmark/retrieval/grep/vikingdb_bm25/README.md index 
cf033c616c..f845b68f71 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/README.md +++ b/benchmark/retrieval/grep/vikingdb_bm25/README.md @@ -49,9 +49,9 @@ python3 step2_quality.py --keywords grep reindex SyncHTTPClient # Optional: --regenerate-ground-truth (force recompute, requires engine=fs) ``` -## Performance — Latency & Recall at Scale +## Performance — Latency at Scale -Tests grep speed and recall on a large synthetic dataset (default: 100K files). +Tests grep speed and returned match count on a large synthetic dataset (default: 200K files). **Data source:** Generated from `ai_wiki.txt` with target words injected at known probabilities. @@ -60,27 +60,32 @@ Tests grep speed and recall on a large synthetic dataset (default: 100K files). | 0 | `step0_prepare_data.py` | Generate synthetic dataset (dir_xxx/wiki_xxx.txt) | | 1 | `step1_add_resource.py` | Import data (no VLM/embedding, fast) | | 2 | `step2_reindex.py` | Async reindex via openviking-server (concurrency=16, polling) | -| 3 | `step3_benchmark.py` | Measure latency and recall (ground truth from fs engine, cached) | +| 3 | `step3_benchmark.py` | Measure latency and returned match count with `node_limit=256` | ### Target Words -12 words across 4 probability tiers: +15 words across 5 probability tiers: | Probability | Words | Expected hits (per 100K files) | |-------------|-------|-------------------------------| | 50% | quantumnexus, synapseflow, deepvector | ~50,000 | -| 10% | bm25engine, vikingcore, retrievex | ~10,000 | -| 0.1% | zephyrhash, cryptolattice, nebulalink | ~100 | -| 0.01% | xenoform, quarkpulse, omegabind | ~10 | +| 1% | heliofract, prismcache, fluxkernel | ~2,000 | +| 0.1% | auroracode, kiteshade, glyphvector | ~200 | +| 0.1% | cortexmint, latticewave, spiralsync | ~200 | +| 0.05% | ripplehash, embertrace, novaframe | ~100 | +| 0.01% | zephyrloom, quartzrelay, nebulaindex | ~20 | ### Usage ```bash cd performance/ -# Step 0: Generate data (default: 100 dirs x 1000 files = 100K files) +# 
Step 0: Generate data (default: 200 dirs x 1000 files = 200K files) python3 step0_prepare_data.py +# Optional: append more data for scale-out without overwriting existing dirs +python3 step0_prepare_data.py --start-dir 100 --num-dirs 100 + # Step 1: Import without indexing (fast) python3 step1_add_resource.py @@ -89,7 +94,7 @@ python3 step2_reindex.py # Optional: --concurrency N (default: 16) # Step 3: Benchmark — run with different engine configs -# Run A: fs engine (also generates ground truth cache on first run) +# Run A: fs engine # 1. Set ov.conf: "grep": {"engine": "fs"} # 2. Restart server python3 step3_benchmark.py --engine-label fs @@ -103,9 +108,9 @@ python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json ## Key Concepts - **Effectiveness** tests compare grep results against ground truth from fs-engine grep (cached locally) -- **Performance** tests compare grep latency and match counts between engine configs (ground truth from fs-engine grep, cached) -- Both ground truth caches are stored in `~/.openviking/data/benchmark/.ground_truth/` -- First run of each step3 MUST use `engine=fs` in ov.conf to generate ground truth; subsequent runs can use any engine +- **Performance** tests compare grep latency and returned match counts between engine configs; no ground truth is generated - Both follow the same workflow: import (with indexing) → benchmark/evaluate - Both support **resumable** execution via progress files (separate for import and reindex) - Change grep engine via `ov.conf` and restart the server between benchmark runs +- To horizontally scale the synthetic dataset, run Step 0 again with a new `--start-dir`, + then rerun Step 1 and Step 2. 
diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md index 3801855063..351550741c 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md +++ b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md @@ -49,9 +49,9 @@ python3 step2_quality.py # 可选参数:--regenerate-ground-truth (强制重算,需 engine=fs) ``` -## Performance — 检索性能 +## Performance — 检索延迟 -在大规模合成数据集(默认 10 万文件)上测试 grep 速度和召回率。 +在大规模合成数据集(默认 20 万文件)上测试 grep 速度和返回匹配数。 **数据来源:** 从 `ai_wiki.txt` 生成,按已知概率注入目标单词。 @@ -60,27 +60,32 @@ python3 step2_quality.py | 0 | `step0_prepare_data.py` | 生成合成数据集(dir_xxx/wiki_xxx.txt) | | 1 | `step1_add_resource.py` | 导入数据(不建索引,速度快) | | 2 | `step2_reindex.py` | 通过 openviking-server 异步构建索引(并发=16,轮询) | -| 3 | `step3_benchmark.py` | 测量延迟和召回率(ground truth 来自 fs 引擎,缓存) | +| 3 | `step3_benchmark.py` | 使用 `node_limit=256` 测量延迟和返回匹配数 | ### 目标单词 -12 个单词,分 4 个概率层级: +15 个单词,分 5 个概率层级: -| 概率 | 单词 | 预期命中数(每 10 万文件) | +| 概率 | 单词 | 预期命中数(每 20 万文件) | |------|------|---------------------------| | 50% | quantumnexus, synapseflow, deepvector | ~50,000 | -| 10% | bm25engine, vikingcore, retrievex | ~10,000 | -| 0.1% | zephyrhash, cryptolattice, nebulalink | ~100 | -| 0.01% | xenoform, quarkpulse, omegabind | ~10 | +| 1% | heliofract, prismcache, fluxkernel | ~2,000 | +| 0.1% | auroracode, kiteshade, glyphvector | ~200 | +| 0.1% | cortexmint, latticewave, spiralsync | ~200 | +| 0.05% | ripplehash, embertrace, novaframe | ~100 | +| 0.01% | zephyrloom, quartzrelay, nebulaindex | ~20 | ### 使用方法 ```bash cd performance/ -# 步骤 0:生成数据(默认:100 目录 x 1000 文件 = 10 万文件) +# 步骤 0:生成数据(默认:200 目录 x 1000 文件 = 20 万文件) python3 step0_prepare_data.py +# 可选:追加更多数据,用于水平扩容,不覆盖已有目录 +python3 step0_prepare_data.py --start-dir 100 --num-dirs 100 + # 步骤 1:导入数据(不建索引,速度快) python3 step1_add_resource.py @@ -89,7 +94,7 @@ python3 step2_reindex.py # 可选参数:--concurrency N (默认:16) # 步骤 3:基准测试 — 用不同引擎配置各跑一次 -# 运行 A:fs 引擎(首次运行同时生成 ground truth 缓存) +# 运行 A:fs 引擎 # 1. 
设置 ov.conf: "grep": {"engine": "fs"} # 2. 重启服务 python3 step3_benchmark.py --engine-label fs @@ -103,9 +108,8 @@ python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json ## 核心概念 - **Effectiveness(效果测试)** 将 grep 结果与 fs 引擎的 ground truth 对比(本地缓存) -- **Performance(性能测试)** 对比不同引擎的延迟和匹配数(ground truth 来自 fs 引擎,本地缓存) -- 两个 ground truth 缓存均存储在 `~/.openviking/data/benchmark/.ground_truth/` -- 每个 step3 首次运行必须使用 ov.conf 的 `engine=fs` 来生成 ground truth;后续运行可使用任意引擎 +- **Performance(性能测试)** 对比不同引擎的延迟和返回匹配数,不生成 ground truth - 两者遵循相同流程:导入(含建索引)→ 评估/测试 - 两者均支持**断点续传**(导入和索引各有独立进度文件) - 切换 grep 引擎需修改 `ov.conf` 并重启服务,在不同运行之间对比 +- 如需水平扩展合成数据集,可用新的 `--start-dir` 再运行步骤 0,然后重跑步骤 1 和步骤 2。 diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step0_prepare_data.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step0_prepare_data.py index b8118278ae..36b72a548d 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/performance/step0_prepare_data.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step0_prepare_data.py @@ -10,20 +10,22 @@ /dir_001/wiki_000.txt ... dir_001/wiki_999.txt ... -Each dir_xxx contains 1000 files. Default: 100 directories (100,000 files). +Each dir_xxx contains 1000 files. Default: 200 directories (200,000 files). Target words are injected by replacing a random word in the text. All target words must NOT exist in the original source text. 
-Default target words and probabilities (12 words, 4 tiers): - 50% : quantumnexus, synapseflow, deepvector - 10% : bm25engine, vikingcore, retrievex - 0.1% : zephyrhash, cryptolattice, nebulalink - 0.01% : xenoform, quarkpulse, omegabind +Default target words and probabilities (15 words, 5 tiers): + 1% : heliofract, prismcache, fluxkernel + 0.1% : auroracode, kiteshade, glyphvector + 0.1% : cortexmint, latticewave, spiralsync + 0.05% : ripplehash, embertrace, novaframe + 0.01% : zephyrloom, quartzrelay, nebulaindex Usage: python3 step0_prepare_data.py python3 step0_prepare_data.py --num-dirs 50 --seed 42 + python3 step0_prepare_data.py --start-dir 100 --num-dirs 100 # append dir_100..dir_199 """ from __future__ import annotations @@ -39,19 +41,20 @@ DEFAULT_OUTPUT = os.path.expanduser("~/.openviking/data/benchmark/synthetic") FILES_PER_DIR = 1000 -TARGET_WORDS = { - 0.50: ["quantumnexus", "synapseflow", "deepvector"], - 0.10: ["bm25engine", "vikingcore", "retrievex"], - 0.001: ["zephyrhash", "cryptolattice", "nebulalink"], - 0.0001: ["xenoform", "quarkpulse", "omegabind"], -} +TARGET_GROUPS: list[tuple[float, list[str]]] = [ + (0.01, ["heliofract", "prismcache", "fluxkernel"]), + (0.001, ["auroracode", "kiteshade", "glyphvector"]), + (0.001, ["cortexmint", "latticewave", "spiralsync"]), + (0.0005, ["ripplehash", "embertrace", "novaframe"]), + (0.0001, ["zephyrloom", "quartzrelay", "nebulaindex"]), +] def verify_target_words(text: str) -> None: """Verify that no target word appears in the source text.""" text_lower = text.lower() conflicts = [] - for prob, words in TARGET_WORDS.items(): + for prob, words in TARGET_GROUPS: for word in words: if word.lower() in text_lower: conflicts.append((word, prob)) @@ -77,19 +80,23 @@ def generate_dataset( source_text: str, output_dir: str, num_dirs: int, + start_dir: int = 0, seed: int = 42, ) -> dict: - rng = random.Random(seed) + rng = random.Random(seed + start_dir) total_files = num_dirs * FILES_PER_DIR - injection_stats = 
{word: 0 for prob_words in TARGET_WORDS.values() for word in prob_words} + injection_stats = {word: 0 for _prob, words in TARGET_GROUPS for word in words} + end_dir = start_dir + num_dirs - 1 if num_dirs > 0 else start_dir print(f" Generating {num_dirs} dirs x {FILES_PER_DIR} files = {total_files} files") + print(f" Directory range: dir_{start_dir:03d} .. dir_{end_dir:03d}") print(f" Output: {output_dir}") print() t0 = time.monotonic() - for dir_idx in range(num_dirs): + for offset in range(num_dirs): + dir_idx = start_dir + offset dir_name = f"dir_{dir_idx:03d}" dir_path = os.path.join(output_dir, dir_name) os.makedirs(dir_path, exist_ok=True) @@ -99,7 +106,7 @@ def generate_dataset( file_path = os.path.join(dir_path, file_name) content = source_text - for prob, words in TARGET_WORDS.items(): + for prob, words in TARGET_GROUPS: for word in words: if rng.random() < prob: content = inject_word(content, word) @@ -108,13 +115,14 @@ def generate_dataset( with open(file_path, "w") as f: f.write(content) - if (dir_idx + 1) % 10 == 0 or dir_idx == num_dirs - 1: + if (offset + 1) % 10 == 0 or offset == num_dirs - 1: elapsed = time.monotonic() - t0 - print(f" [{dir_idx + 1}/{num_dirs}] dirs created ({elapsed:.1f}s)") + print(f" [{offset + 1}/{num_dirs}] dirs created ({elapsed:.1f}s)") elapsed = time.monotonic() - t0 return { "total_files": total_files, + "start_dir": start_dir, "num_dirs": num_dirs, "files_per_dir": FILES_PER_DIR, "elapsed_s": round(elapsed, 1), @@ -133,7 +141,13 @@ def main(): "--output", default=DEFAULT_OUTPUT, help=f"Output directory (default: {DEFAULT_OUTPUT})" ) parser.add_argument( - "--num-dirs", type=int, default=100, help="Number of directories (default: 100)" + "--num-dirs", type=int, default=200, help="Number of directories (default: 200)" + ) + parser.add_argument( + "--start-dir", + type=int, + default=0, + help="Starting directory index for append/scale-out runs (default: 0)", ) parser.add_argument("--seed", type=int, default=42, help="Random 
seed (default: 42)") args = parser.parse_args() @@ -146,6 +160,7 @@ def main(): print("=" * 80) print(f" Source: {source}") print(f" Output: {output}") + print(f" StartDir: {args.start_dir}") print(f" Dirs: {args.num_dirs}") print(f" Seed: {args.seed}") print() @@ -168,25 +183,26 @@ def main(): print(" All target words verified OK.") print() print(" Target words and injection probabilities:") - for prob in sorted(TARGET_WORDS.keys(), reverse=True): - words = TARGET_WORDS[prob] - pct = f"{prob * 100:.2f}%" + for prob, words in sorted(TARGET_GROUPS, key=lambda item: item[0], reverse=True): + pct = f"{prob * 100:.3f}%" print(f" {pct:>8s} : {', '.join(words)}") print() - summary = generate_dataset(source_text, output, args.num_dirs, seed=args.seed) + summary = generate_dataset( + source_text, output, args.num_dirs, start_dir=args.start_dir, seed=args.seed + ) print() print("=" * 80) print("Summary:") print(f" Total files: {summary['total_files']:,}") + print(f" Start dir: {summary['start_dir']}") print(f" Directories: {summary['num_dirs']}") print(f" Elapsed: {summary['elapsed_s']}s") print() print(" Target word injection counts:") total_files = summary["total_files"] - for prob in sorted(TARGET_WORDS.keys(), reverse=True): - words = TARGET_WORDS[prob] + for prob, words in sorted(TARGET_GROUPS, key=lambda item: item[0], reverse=True): for word in words: actual = summary["injection_stats"][word] expected = total_files * prob @@ -194,7 +210,7 @@ def main(): print( f" {word:<18s} actual={actual:>6d} " f"expected~{expected:>8.1f} " - f"rate={pct:.3f}% (target={prob * 100:.2f}%)" + f"rate={pct:.3f}% (target={prob * 100:.3f}%)" ) print() diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py index db98c088a3..e0b9290e47 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py 
@@ -1,14 +1,12 @@ #!/usr/bin/env python3 -"""Step 3 (Performance): Benchmark grep performance and recall. +"""Step 3 (Performance): Benchmark grep latency and match count. -Runs grep queries against the synthetic dataset, measuring both latency -and recall. Ground truth (match counts per word) is obtained by running -grep with engine=fs on first run, then cached. +Runs grep queries against the synthetic dataset, measuring latency and +returned match count with a fixed node_limit. Run twice with different ov.conf engine settings to compare: 1. Set ov.conf: "grep": {"engine": "fs"}, restart, then: python3 step3_benchmark.py --engine-label fs - (This also generates the ground truth cache) 2. Set ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0}, restart, then: python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json @@ -18,142 +16,31 @@ from __future__ import annotations import argparse -import hashlib import json import os -import re import time from openviking_cli.client.sync_http import SyncHTTPClient BASE_URI = "viking://resources/benchmark/performance" -DATA_DIR = os.path.expanduser("~/.openviking/data/benchmark/performance") -GROUND_TRUTH_DIR = os.path.join(DATA_DIR, ".ground_truth") -MISS_DIR = os.path.join(DATA_DIR, ".miss") SYNTHETIC_DIR = os.path.expanduser("~/.openviking/data/benchmark/synthetic") # Same target words as step0_prepare_data.py -TARGET_WORDS = { - 0.50: ["quantumnexus", "synapseflow", "deepvector"], - 0.10: ["bm25engine", "vikingcore", "retrievex"], - 0.001: ["zephyrhash", "cryptolattice", "nebulalink"], - 0.0001: ["xenoform", "quarkpulse", "omegabind"], -} +TARGET_GROUPS: list[tuple[float, list[str]]] = [ + (0.01, ["heliofract", "prismcache", "fluxkernel"]), + (0.001, ["auroracode", "kiteshade", "glyphvector"]), + (0.001, ["cortexmint", "latticewave", "spiralsync"]), + (0.0005, ["ripplehash", "embertrace", "novaframe"]), + (0.0001, ["zephyrloom", "quartzrelay", "nebulaindex"]), +] RUNS = 3 WARMUP = 1 
+GREP_NODE_LIMIT = 256 -GROUND_TRUTH_DIR = os.path.join(DATA_DIR, ".ground_truth") -MISS_DIR = os.path.join(DATA_DIR, ".miss") -SYNTHETIC_DIR = os.path.expanduser("~/.openviking/data/benchmark/synthetic") - - -def _sanitize_filename(s: str, max_len: int = 40) -> str: - s = re.sub(r"[^a-zA-Z0-9_\-]", "_", s) - s = s.strip("_") - return s[:max_len] - - -def _perf_cache_path(uri: str) -> str: - h = hashlib.sha256(uri.encode("utf-8")).hexdigest()[:8] - return os.path.join(GROUND_TRUTH_DIR, f"perf_{h}.json") - - -def _load_ground_truth_cache(uri: str) -> dict[str, int] | None: - path = _perf_cache_path(uri) - if not os.path.isfile(path): - # Fallback: try old-style filename - old_h = hashlib.sha256(SYNTHETIC_DIR.encode("utf-8")) - for prob in sorted(TARGET_WORDS.keys()): - for word in TARGET_WORDS[prob]: - old_h.update(word.encode("utf-8")) - old_key = old_h.hexdigest()[:16] - old_path = os.path.join(GROUND_TRUTH_DIR, f"perf_{old_key}.json") - if os.path.isfile(old_path): - with open(old_path, encoding="utf-8") as f: - data = json.load(f) - return data.get("word_counts") - return None - with open(path, encoding="utf-8") as f: - data = json.load(f) - return data.get("word_counts") - - -def _save_ground_truth_cache(uri: str, word_counts: dict[str, int]) -> None: - os.makedirs(GROUND_TRUTH_DIR, exist_ok=True) - path = _perf_cache_path(uri) - with open(path, "w", encoding="utf-8") as f: - json.dump({"uri": uri, "word_counts": word_counts}, f, indent=2, ensure_ascii=False) - - -def _perf_miss_path(engine_label: str) -> str: - h = hashlib.sha256(BASE_URI.encode("utf-8")).hexdigest()[:8] - safe_label = _sanitize_filename(engine_label) - return os.path.join(MISS_DIR, f"perf_{safe_label}_{h}.json") - - -def _save_perf_miss( - engine_label: str, - results: list[dict], - ground_truth: dict[str, int], -) -> None: - """Save miss analysis (count diff per word) for performance benchmark.""" - miss_data: list[dict] = [] - has_miss = False - for r in results: - if "error" in r or "word" 
not in r: - continue - word = r["word"] - found = r.get("matches", 0) - expected = ground_truth.get(word, 0) - if found != expected or expected > 0: - miss_data.append( - { - "word": word, - "probability": r.get("probability"), - "expected": expected, - "found": found, - "diff": found - expected, - "recall_approx": r.get("recall_approx"), - } - ) - if found < expected: - has_miss = True - if not has_miss: - return - os.makedirs(MISS_DIR, exist_ok=True) - path = _perf_miss_path(engine_label) - with open(path, "w", encoding="utf-8") as f: - json.dump( - {"engine_label": engine_label, "uri": BASE_URI, "misses": miss_data}, - f, - indent=2, - ensure_ascii=False, - ) - - -def compute_ground_truth(client: SyncHTTPClient, uri: str) -> tuple[dict[str, int], float]: - """Compute ground truth via OV grep (fs engine). Returns word -> match count.""" - cached = _load_ground_truth_cache(uri) - if cached is not None: - return cached, 0.0 - all_words = [] - for prob in sorted(TARGET_WORDS.keys()): - all_words.extend(TARGET_WORDS[prob]) - - word_counts: dict[str, int] = {} - t0 = time.monotonic() - for w in all_words: - result = client.grep(uri=uri, pattern=w, node_limit=100000) - count = 0 - if isinstance(result, dict): - count = len(result.get("matches", [])) - word_counts[w] = count - elapsed = time.monotonic() - t0 - - _save_ground_truth_cache(uri, word_counts) - return word_counts, elapsed +def _format_probability(probability: float) -> str: + return f"{probability * 100:.3f}%" def count_local_files() -> int: @@ -170,7 +57,7 @@ def count_local_files() -> int: def run_grep(client: SyncHTTPClient, pattern: str, uri: str) -> tuple[float, int, set[str]]: start = time.monotonic() - result = client.grep(uri=uri, pattern=pattern, node_limit=100000) + result = client.grep(uri=uri, pattern=pattern, node_limit=GREP_NODE_LIMIT) elapsed = time.monotonic() - start match_uris: set[str] = set() if isinstance(result, dict): @@ -181,16 +68,13 @@ def run_grep(client: SyncHTTPClient, pattern: 
str, uri: str) -> tuple[float, int return elapsed, len(match_uris), match_uris -def benchmark_engine( - client: SyncHTTPClient, total_files: int, ground_truth: dict[str, int] -) -> list[dict]: +def benchmark_engine(client: SyncHTTPClient, total_files: int) -> list[dict]: results = [] - for prob in sorted(TARGET_WORDS.keys(), reverse=True): - words = TARGET_WORDS[prob] + for prob, words in sorted(TARGET_GROUPS, key=lambda item: item[0], reverse=True): for word in words: - expected = ground_truth.get(word, int(total_files * prob)) - label = f"{word} (p={prob * 100:.2f}%, expect~{expected})" + expected = int(total_files * prob) + label = f"{word} (p={_format_probability(prob)}, expect~{expected})" print(f" {label} ...", end=" ", flush=True) @@ -221,12 +105,7 @@ def benchmark_engine( avg_ms = sum(times) / len(times) * 1000 min_ms = min(times) * 1000 max_ms = max(times) * 1000 - # Recall: how many of the expected files were found - # This is approximate since injection is probabilistic - recall = match_count / expected if expected > 0 else 1.0 - print( - f"avg={avg_ms:.1f}ms matches={match_count} expected~{expected} recall~{recall:.2f}" - ) + print(f"avg={avg_ms:.1f}ms matches={match_count} expected~{expected}") results.append( { "label": label, @@ -237,7 +116,6 @@ def benchmark_engine( "max_ms": round(max_ms, 1), "matches": match_count, "expected_approx": expected, - "recall_approx": round(recall, 4), } ) @@ -308,14 +186,14 @@ def print_comparison( cmp = compare_by_word.get(word) if not cmp: print( - f"{word:<20} {r.get('probability', 0) * 100:>7.2f}% {'N/A':>14} {cur_ms:>14.1f} {'---':>10}" + f"{word:<20} {_format_probability(r.get('probability', 0)):>8} {'N/A':>14} {cur_ms:>14.1f} {'---':>10}" ) continue cmp_ms = cmp["avg_ms"] speedup = cmp_ms / cur_ms if cur_ms > 0 else float("inf") speedup_str = f"{speedup:.1f}x" print( - f"{word:<20} {r.get('probability', 0) * 100:>7.2f}% " + f"{word:<20} {_format_probability(r.get('probability', 0)):>8} " f"{cmp_ms:>14.1f} 
{cur_ms:>14.1f} {speedup_str:>10} " f"{cmp.get('matches', '?'):>12} {r.get('matches', '?'):>12}" ) @@ -341,55 +219,51 @@ def main(): client = SyncHTTPClient() client.initialize() - print("Computing ground truth (OV grep, fs engine)...") - ground_truth, gt_elapsed = compute_ground_truth(client, BASE_URI) - if gt_elapsed > 0: - print(f" Ground truth computed in {gt_elapsed:.1f}s") - else: - print(" Ground truth loaded from cache") - print("=" * 80) print(f"Step 3 (Performance): Grep Benchmark — engine={args.engine_label}") print("=" * 80) print(f" URI: {BASE_URI}") print(f" Total files: {total_files:,}") + print(f" Grep limit: {GREP_NODE_LIMIT}") print(f" Runs per test: {RUNS} (warmup: {WARMUP})") print() print("Ensure ov.conf has the desired grep config and the server is restarted.") print() try: - results = benchmark_engine(client, total_files, ground_truth) + results = benchmark_engine(client, total_files) finally: client.close() output_file = f"step3_result_{args.engine_label}.json" with open(output_file, "w", encoding="utf-8") as f: json.dump( - {"engine_label": args.engine_label, "total_files": total_files, "results": results}, + { + "engine_label": args.engine_label, + "total_files": total_files, + "grep_node_limit": GREP_NODE_LIMIT, + "results": results, + }, f, indent=2, ensure_ascii=False, ) print(f"\nResults saved to {output_file}") - # Save miss analysis - _save_perf_miss(args.engine_label, results, ground_truth) - print() print( - f"{'Word':<20} {'Prob':>8} {'Avg(ms)':>10} {'Min(ms)':>10} {'Max(ms)':>10} {'Matches':>10} {'Expect~':>10} {'Recall~':>10}" + f"{'Word':<20} {'Prob':>8} {'Avg(ms)':>10} {'Min(ms)':>10} {'Max(ms)':>10} {'Matches':>10} {'Expect~':>10}" ) - print("-" * 108) + print("-" * 96) for r in results: if "error" in r: print(f"{r.get('word', '?'):<20} {'FAILED':>10}") else: print( - f"{r['word']:<20} {r.get('probability', 0) * 100:>7.2f}% " + f"{r['word']:<20} {_format_probability(r.get('probability', 0)):>8} " f"{r['avg_ms']:>10.1f} 
{r['min_ms']:>10.1f} " f"{r['max_ms']:>10.1f} {r['matches']:>10} " - f"{r.get('expected_approx', '?'):>10} {r.get('recall_approx', '?'):>10}" + f"{r.get('expected_approx', '?'):>10}" ) print() @@ -403,9 +277,6 @@ def main(): prev_results = prev.get("results", []) print_comparison(args.engine_label, results, prev_label, prev_results) - print(f"\nMiss analysis saved to: {MISS_DIR}/") - print(f"Ground truth cache: {GROUND_TRUTH_DIR}/") - if __name__ == "__main__": main() diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index d7cbcdcbb4..1ea8c6c35f 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -888,7 +888,7 @@ Grep engine configuration for content pattern search. These settings are server- { "grep": { "engine": "auto", - "switch_to_remote_threshold": 1000 + "switch_to_remote_threshold": 10000 } } ``` @@ -896,9 +896,7 @@ Grep engine configuration for content pattern search. These settings are server- | Parameter | Type | Description | Default | |-----------|------|-------------|---------| | `engine` | str | Search engine mode: `"auto"` uses VikingDB BM25 recall when available and falls back to local filesystem search; `"fs"` forces local filesystem search only. | `"auto"` | -| `switch_to_remote_threshold` | int | L2 record count threshold to switch to VikingDB BM25 recall. When the number of L2 files under the search scope exceeds this threshold, VikingDB BM25 is used for phase-1 recall; otherwise local filesystem search is used. Set to `0` to always use VikingDB BM25. Must be ≥ 0. | `1000` | - -The per-request parameter `remote_return_limit` (0–100000, default `0` = auto-adapt to 100000) controls the maximum number of files recalled by VikingDB BM25 in a single grep call. +| `switch_to_remote_threshold` | int | L2 record count threshold to switch to VikingDB BM25 recall. 
When the number of L2 files under the search scope exceeds this threshold, VikingDB BM25 is used for phase-1 recall; otherwise local filesystem search is used. Set to `0` to always use VikingDB BM25. Must be ≥ 0. | `10000` | ### storage diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md index 4033c8a31a..d8b909a8bc 100644 --- a/docs/zh/guides/01-configuration.md +++ b/docs/zh/guides/01-configuration.md @@ -859,7 +859,7 @@ Grep 引擎配置,用于内容模式搜索。这些设置为服务端配置, { "grep": { "engine": "auto", - "switch_to_remote_threshold": 1000 + "switch_to_remote_threshold": 10000 } } ``` @@ -867,9 +867,7 @@ Grep 引擎配置,用于内容模式搜索。这些设置为服务端配置, | 参数 | 类型 | 说明 | 默认值 | |------|------|------|--------| | `engine` | str | 搜索引擎模式:`"auto"` 在可用时使用 VikingDB BM25 召回,不可用时回退到本地文件系统搜索;`"fs"` 强制仅使用本地文件系统搜索。 | `"auto"` | -| `switch_to_remote_threshold` | int | 切换到 VikingDB BM25 召回的 L2 记录数阈值。当搜索范围内的 L2 文件数超过此阈值时,使用 VikingDB BM25 进行第一阶段召回;否则使用本地文件系统搜索。设为 `0` 表示始终使用 VikingDB BM25。必须 ≥ 0。 | `1000` | - -请求级别参数 `remote_return_limit`(0–100000,默认 `0` = 自适应调整为 100000)控制单次 grep 调用中 VikingDB BM25 召回的最大文件数。 +| `switch_to_remote_threshold` | int | 切换到 VikingDB BM25 召回的 L2 记录数阈值。当搜索范围内的 L2 文件数超过此阈值时,使用 VikingDB BM25 进行第一阶段召回;否则使用本地文件系统搜索。设为 `0` 表示始终使用 VikingDB BM25。必须 ≥ 0。 | `10000` | ### storage diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index e6cf6cb5a8..029c1d7d33 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -929,12 +929,11 @@ async def _grep_vikingdb_then_fs( query = " ".join(kw.strip() for kw in pattern.split("|") if kw.strip()) filter_expr = PathScope("uri", uri, depth=level_limit) - # Auto-adapt remote_return_limit: when 0 (default), use the maximum - # limit (100000) so that search_by_keywords returns all matching - # documents without truncation. The real cost is in phase 2 (local - # regex on recalled files), not in the bm25 recall itself. 
+ # Auto-adapt remote_return_limit: when 0 (default), recall up to + # 5x requested matches while capping at VikingDB's max limit. If + # node_limit is unset, use the maximum limit to avoid truncation. if remote_return_limit == 0: - remote_return_limit = 100000 + remote_return_limit = min(node_limit * 5, 100000) if node_limit else 100000 # Step 1: vikingdb recall candidate files try: diff --git a/openviking_cli/utils/config/grep_config.py b/openviking_cli/utils/config/grep_config.py index 9e2564fe90..75c12c82d1 100644 --- a/openviking_cli/utils/config/grep_config.py +++ b/openviking_cli/utils/config/grep_config.py @@ -21,7 +21,7 @@ class GrepConfig(BaseModel): ) switch_to_remote_threshold: int = Field( - default=1000, + default=10000, ge=0, description=( "L2 record count threshold to switch to vikingdb; 0 means always use vikingdb." diff --git a/tests/storage/test_viking_fs_grep.py b/tests/storage/test_viking_fs_grep.py index 1922279d5d..89b98bad2d 100644 --- a/tests/storage/test_viking_fs_grep.py +++ b/tests/storage/test_viking_fs_grep.py @@ -7,12 +7,22 @@ import openviking.storage.viking_fs as viking_fs_module from openviking.storage.viking_fs import _DEFAULT_GREP_FILE_CONCURRENCY, VikingFS +from openviking_cli.utils.config.grep_config import GrepConfig class _DummyAgfs: pass +class _DummyVectorStore: + def __init__(self): + self.calls = [] + + async def search_by_keywords(self, **kwargs): + self.calls.append(kwargs) + return [] + + @pytest.fixture def fs(monkeypatch): viking_fs = VikingFS(agfs=_DummyAgfs()) @@ -34,6 +44,41 @@ async def _fake_stat(uri, ctx=None, skip_count=False): return {"name": uri.rsplit("/", 1)[-1], "isDir": True} +def test_grep_config_default_switch_to_remote_threshold_is_10000(): + assert GrepConfig().switch_to_remote_threshold == 10000 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("node_limit", "expected_remote_limit"), + [ + (7, 35), + (None, 100000), + (50000, 100000), + ], +) +async def 
test_grep_vikingdb_auto_remote_limit_uses_five_times_node_limit( + monkeypatch, node_limit, expected_remote_limit +): + fs = VikingFS(agfs=_DummyAgfs()) + vector_store = _DummyVectorStore() + monkeypatch.setattr(fs, "_get_vector_store", lambda: vector_store) + + result = await fs._grep_vikingdb_then_fs( + uri="viking://resources", + pattern="needle", + exclude_uri=None, + case_insensitive=False, + node_limit=node_limit, + level_limit=10, + remote_return_limit=0, + ctx=None, + ) + + assert result == {"matches": [], "count": 0, "match_count": 0, "files_scanned": 0} + assert vector_store.calls[0]["limit"] == expected_remote_limit + + @pytest.mark.asyncio async def test_grep_preserves_dfs_order_and_node_limit(monkeypatch): fs = VikingFS(agfs=_DummyAgfs()) From f13513303709170a1ce26c36f378aaaccd3f884b Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Wed, 17 Jun 2026 19:57:18 +0800 Subject: [PATCH 26/31] optimize: benchmark client timeout --- benchmark/retrieval/grep/vikingdb_bm25/README.md | 10 +++++----- benchmark/retrieval/grep/vikingdb_bm25/README_CN.md | 8 ++++---- .../grep/vikingdb_bm25/performance/step3_benchmark.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README.md b/benchmark/retrieval/grep/vikingdb_bm25/README.md index f845b68f71..0f101fa97b 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/README.md +++ b/benchmark/retrieval/grep/vikingdb_bm25/README.md @@ -10,7 +10,7 @@ vikingdb_bm25/ ├── effectiveness/ # Retrieval effectiveness (recall/precision/F1) │ ├── step1_add_resource.py │ └── step2_quality.py -└── performance/ # Retrieval performance (latency + recall at scale) +└── performance/ # Retrieval performance (latency + returned match count at scale) ├── step0_prepare_data.py ├── step1_add_resource.py ├── step2_reindex.py @@ -66,9 +66,8 @@ Tests grep speed and returned match count on a large synthetic dataset (default: 15 words across 5 probability tiers: -| Probability | Words | 
Expected hits (per 100K files) | +| Probability | Words | Expected hits (per 200K files) | |-------------|-------|-------------------------------| -| 50% | quantumnexus, synapseflow, deepvector | ~50,000 | | 1% | heliofract, prismcache, fluxkernel | ~2,000 | | 0.1% | auroracode, kiteshade, glyphvector | ~200 | | 0.1% | cortexmint, latticewave, spiralsync | ~200 | @@ -109,8 +108,9 @@ python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json - **Effectiveness** tests compare grep results against ground truth from fs-engine grep (cached locally) - **Performance** tests compare grep latency and returned match counts between engine configs; no ground truth is generated -- Both follow the same workflow: import (with indexing) → benchmark/evaluate -- Both support **resumable** execution via progress files (separate for import and reindex) +- **Effectiveness** imports real repos with indexing in a single step, then evaluates quality +- **Performance** imports synthetic data without indexing, builds vector indexes asynchronously, then benchmarks latency +- **Performance** import/reindex steps support resumable execution via progress files - Change grep engine via `ov.conf` and restart the server between benchmark runs - To horizontally scale the synthetic dataset, run Step 0 again with a new `--start-dir`, then rerun Step 1 and Step 2. 
diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md index 351550741c..9d758ae2cb 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md +++ b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md @@ -10,7 +10,7 @@ vikingdb_bm25/ ├── effectiveness/ # 检索效果测试(召回率/精确率/F1) │ ├── step1_add_resource.py │ └── step2_quality.py -└── performance/ # 检索性能测试(延迟 + 大规模召回) +└── performance/ # 检索性能测试(延迟 + 大规模返回匹配数) ├── step0_prepare_data.py ├── step1_add_resource.py ├── step2_reindex.py @@ -68,7 +68,6 @@ python3 step2_quality.py | 概率 | 单词 | 预期命中数(每 20 万文件) | |------|------|---------------------------| -| 50% | quantumnexus, synapseflow, deepvector | ~50,000 | | 1% | heliofract, prismcache, fluxkernel | ~2,000 | | 0.1% | auroracode, kiteshade, glyphvector | ~200 | | 0.1% | cortexmint, latticewave, spiralsync | ~200 | @@ -109,7 +108,8 @@ python3 step3_benchmark.py --engine-label auto --compare step3_result_fs.json - **Effectiveness(效果测试)** 将 grep 结果与 fs 引擎的 ground truth 对比(本地缓存) - **Performance(性能测试)** 对比不同引擎的延迟和返回匹配数,不生成 ground truth -- 两者遵循相同流程:导入(含建索引)→ 评估/测试 -- 两者均支持**断点续传**(导入和索引各有独立进度文件) +- **Effectiveness** 直接一次性导入真实代码仓并建索引,然后执行效果评估 +- **Performance** 先导入合成数据(不建索引),再异步建向量索引,最后执行延迟基准测试 +- **Performance** 的导入与 reindex 步骤支持**断点续传**(各有独立进度文件) - 切换 grep 引擎需修改 `ov.conf` 并重启服务,在不同运行之间对比 - 如需水平扩展合成数据集,可用新的 `--start-dir` 再运行步骤 0,然后重跑步骤 1 和步骤 2。 diff --git a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py index e0b9290e47..c756f325ca 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py +++ b/benchmark/retrieval/grep/vikingdb_bm25/performance/step3_benchmark.py @@ -216,7 +216,7 @@ def main(): total_files = count_local_files() - client = SyncHTTPClient() + client = SyncHTTPClient(timeout=3600) client.initialize() print("=" * 80) From 4ec3ec92877da7175bc1bafc6895407144ffbf56 Mon Sep 17 00:00:00 
2001 From: "liuyang.max" Date: Wed, 17 Jun 2026 21:22:08 +0800 Subject: [PATCH 27/31] update README --- benchmark/retrieval/grep/vikingdb_bm25/README.md | 2 ++ benchmark/retrieval/grep/vikingdb_bm25/README_CN.md | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README.md b/benchmark/retrieval/grep/vikingdb_bm25/README.md index 0f101fa97b..fb78244d4c 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/README.md +++ b/benchmark/retrieval/grep/vikingdb_bm25/README.md @@ -66,6 +66,8 @@ Tests grep speed and returned match count on a large synthetic dataset (default: 15 words across 5 probability tiers: +These word groups are defined in `performance/step0_prepare_data.py` and reused by `performance/step3_benchmark.py`. + | Probability | Words | Expected hits (per 200K files) | |-------------|-------|-------------------------------| | 1% | heliofract, prismcache, fluxkernel | ~2,000 | diff --git a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md index 9d758ae2cb..30fb446a36 100644 --- a/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md +++ b/benchmark/retrieval/grep/vikingdb_bm25/README_CN.md @@ -39,12 +39,12 @@ python3 step1_add_resource.py --source ~/.openviking/data/benchmark/OpenViking-m # 首次运行必须使用 engine=fs 生成 ground truth 缓存: # 1. 设置 ov.conf: "grep": {"engine": "fs"} # 2. 重启服务 -python3 step2_quality.py +python3 step2_quality.py --keywords grep reindex SyncHTTPClient # 后续运行可使用任意引擎(ground truth 从缓存读取): # 1. 设置 ov.conf: "grep": {"engine": "auto", "switch_to_remote_threshold": 0} # 2. 
重启服务 -python3 step2_quality.py +python3 step2_quality.py --keywords grep reindex SyncHTTPClient # 可选参数:--regenerate-ground-truth (强制重算,需 engine=fs) ``` @@ -66,6 +66,8 @@ python3 step2_quality.py 15 个单词,分 5 个概率层级: +这些词组定义在 `performance/step0_prepare_data.py` 中,并由 `performance/step3_benchmark.py` 复用。 + | 概率 | 单词 | 预期命中数(每 20 万文件) | |------|------|---------------------------| | 1% | heliofract, prismcache, fluxkernel | ~2,000 | From 98424e1e4c4953e02990b4e1afa551f4b64a8ac6 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Tue, 23 Jun 2026 17:51:17 +0800 Subject: [PATCH 28/31] fix: rm unused param --- crates/ov_cli/src/client.rs | 6 +- crates/ov_cli/src/commands/search.rs | 2 - crates/ov_cli/src/handlers.rs | 2 - crates/ov_cli/src/main.rs | 5 -- openviking/async_client.py | 2 - openviking/client/local.py | 2 - openviking/server/routers/search.py | 9 +- openviking/service/fs_service.py | 2 - openviking/storage/viking_fs.py | 23 ++--- openviking/sync_client.py | 2 - openviking_cli/client/base.py | 1 - openviking_cli/client/http.py | 1 - tests/server/test_api_code.py | 1 - tests/storage/test_viking_fs_grep.py | 1 - uv.lock | 129 ++++++++++++++------------- 15 files changed, 75 insertions(+), 113 deletions(-) diff --git a/crates/ov_cli/src/client.rs b/crates/ov_cli/src/client.rs index c0b48ced69..7078797a78 100644 --- a/crates/ov_cli/src/client.rs +++ b/crates/ov_cli/src/client.rs @@ -489,9 +489,8 @@ impl HttpClient { ignore_case: bool, node_limit: i32, level_limit: i32, - remote_return_limit: Option, ) -> Result { - let mut body = serde_json::json!({ + let body = serde_json::json!({ "uri": uri, "exclude_uri": exclude_uri, "pattern": pattern, @@ -499,9 +498,6 @@ impl HttpClient { "node_limit": node_limit, "level_limit": level_limit, }); - if let Some(limit) = remote_return_limit { - body["remote_return_limit"] = serde_json::json!(limit); - } self.post("/api/v1/search/grep", &body).await } diff --git a/crates/ov_cli/src/commands/search.rs 
b/crates/ov_cli/src/commands/search.rs index 3648c2b25c..9d6142acd9 100644 --- a/crates/ov_cli/src/commands/search.rs +++ b/crates/ov_cli/src/commands/search.rs @@ -443,7 +443,6 @@ pub async fn grep( ignore_case: bool, node_limit: i32, level_limit: i32, - remote_return_limit: Option, output_format: OutputFormat, compact: bool, ) -> Result<()> { @@ -455,7 +454,6 @@ pub async fn grep( ignore_case, node_limit, level_limit, - remote_return_limit, ) .await?; output_grep_results(&result, output_format, compact); diff --git a/crates/ov_cli/src/handlers.rs b/crates/ov_cli/src/handlers.rs index 3ef3e51e86..e71f0ee2fe 100644 --- a/crates/ov_cli/src/handlers.rs +++ b/crates/ov_cli/src/handlers.rs @@ -1501,7 +1501,6 @@ pub async fn handle_grep( ignore_case: bool, node_limit: i32, level_limit: i32, - remote_return_limit: Option, ctx: CliContext, ) -> Result<()> { // Prevent grep from root directory to avoid excessive server load and timeouts @@ -1533,7 +1532,6 @@ pub async fn handle_grep( ignore_case, node_limit, level_limit, - remote_return_limit, ctx.output_format, ctx.compact, ) diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs index 79a0f3f617..0a453c384f 100644 --- a/crates/ov_cli/src/main.rs +++ b/crates/ov_cli/src/main.rs @@ -704,9 +704,6 @@ enum Commands { help_heading = "Advanced options" )] level_limit: i32, - /// Maximum files recalled by vikingdb bm25; 0 means auto-adapt (0-100000) - #[arg(long = "remote-return-limit", default_value = "0")] - remote_return_limit: Option, }, /// [Data] Run file glob pattern search Glob { @@ -2964,7 +2961,6 @@ async fn main() { ignore_case, node_limit, level_limit, - remote_return_limit, } => { handlers::handle_grep( uri, @@ -2973,7 +2969,6 @@ async fn main() { ignore_case, node_limit, level_limit, - remote_return_limit, ctx, ) .await diff --git a/openviking/async_client.py b/openviking/async_client.py index 6d06b42838..2276174cc4 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -560,7 
+560,6 @@ async def grep( node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, level_limit: int = 5, - remote_return_limit: int = 0, ) -> Dict: """Content search""" await self._ensure_initialized() @@ -571,7 +570,6 @@ async def grep( node_limit=node_limit, exclude_uri=exclude_uri, level_limit=level_limit, - remote_return_limit=remote_return_limit, ) async def glob(self, pattern: str, uri: str = "viking://") -> Dict: diff --git a/openviking/client/local.py b/openviking/client/local.py index a551d50713..2741a1cc56 100644 --- a/openviking/client/local.py +++ b/openviking/client/local.py @@ -444,7 +444,6 @@ async def grep( node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, level_limit: int = 5, - remote_return_limit: int = 0, ) -> Dict[str, Any]: """Content search with pattern.""" return await self._service.fs.grep( @@ -455,7 +454,6 @@ async def grep( node_limit=node_limit, exclude_uri=exclude_uri, level_limit=level_limit, - remote_return_limit=remote_return_limit, ) async def glob(self, pattern: str, uri: str = "viking://") -> Dict[str, Any]: diff --git a/openviking/server/routers/search.py b/openviking/server/routers/search.py index a6f1dc2ce2..8c1f06196d 100644 --- a/openviking/server/routers/search.py +++ b/openviking/server/routers/search.py @@ -7,7 +7,7 @@ from typing import Any, Dict, List, Literal, Optional, Union from fastapi import APIRouter, Depends -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, ConfigDict, model_validator from openviking.core.path_variables import resolve_path_variables from openviking.core.peer_id import normalize_peer_selector @@ -177,12 +177,6 @@ class GrepRequest(BaseModel): case_insensitive: bool = False node_limit: Optional[int] = None level_limit: int = 10 - remote_return_limit: int = Field( - default=0, - ge=0, - le=100000, - description="Maximum files recalled by vikingdb bm25; 0 means auto-adapt", - ) class GlobRequest(BaseModel): @@ -306,7 
+300,6 @@ async def grep( case_insensitive=request.case_insensitive, node_limit=request.node_limit, level_limit=request.level_limit, - remote_return_limit=request.remote_return_limit, ) except AGFSNotFoundError: raise NotFoundError(resolved_uri, "file") diff --git a/openviking/service/fs_service.py b/openviking/service/fs_service.py index cfb2da6b56..92995a21e9 100644 --- a/openviking/service/fs_service.py +++ b/openviking/service/fs_service.py @@ -457,7 +457,6 @@ async def grep( case_insensitive: bool = False, node_limit: Optional[int] = None, level_limit: int = 10, - remote_return_limit: int = 0, ) -> Dict: """Content search.""" viking_fs = self._ensure_initialized() @@ -470,7 +469,6 @@ async def grep( case_insensitive=case_insensitive, node_limit=node_limit, level_limit=level_limit, - remote_return_limit=remote_return_limit, ctx=ctx, ) diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index fd009d2bad..3f34bc9ae6 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -761,7 +761,6 @@ async def grep( node_limit: Optional[int] = None, level_limit: int = 10, ctx: Optional[RequestContext] = None, - remote_return_limit: int = 0, ) -> Dict: """Content search by pattern or keywords. @@ -780,9 +779,8 @@ async def grep( node_limit: Maximum number of results to return level_limit: Maximum depth level to traverse (default: 5) ctx: Request context - remote_return_limit: Maximum files recalled by vikingdb bm25. - 0 means auto-adapt: use maximum limit (100000) to avoid - truncating bm25 recall results (default: 0, max: 100000) + Internal bm25 recall limit is auto-adapted from node_limit as + min(node_limit * 5, 100000); when node_limit is unset, use 100000. Returns: Dict with matches, count, match_count, files_scanned @@ -792,12 +790,6 @@ async def grep( # and avoiding it saves one VikingDB API call. 
await self.stat(uri, ctx=ctx, skip_count=True) - # Clamp remote_return_limit to valid range (0 = auto, 1-100000 = explicit) - if remote_return_limit < 0: - remote_return_limit = 0 - elif remote_return_limit > 0: - remote_return_limit = max(1, min(remote_return_limit, 100000)) - # Read engine and threshold from grep_config (ov.conf) engine = self.grep_config.engine if self.grep_config else "auto" switch_to_remote_threshold = ( @@ -826,7 +818,6 @@ async def grep( case_insensitive=case_insensitive, node_limit=node_limit, level_limit=level_limit, - remote_return_limit=remote_return_limit, ctx=ctx, ) @@ -955,7 +946,6 @@ async def _grep_vikingdb_then_fs( case_insensitive, node_limit, level_limit, - remote_return_limit, ctx, ): """VikingDB bm25 recall + local fs precise matching.""" @@ -967,11 +957,10 @@ async def _grep_vikingdb_then_fs( query = " ".join(kw.strip() for kw in pattern.split("|") if kw.strip()) filter_expr = PathScope("uri", uri, depth=level_limit) - # Auto-adapt remote_return_limit: when 0 (default), recall up to - # 5x requested matches while capping at VikingDB's max limit. If - # node_limit is unset, use the maximum limit to avoid truncation. - if remote_return_limit == 0: - remote_return_limit = min(node_limit * 5, 100000) if node_limit else 100000 + # Auto-adapt bm25 recall limit: recall up to 5x requested matches + # while capping at VikingDB's max limit. If node_limit is unset, + # use the maximum limit to avoid truncation. 
+ remote_return_limit = min(node_limit * 5, 100000) if node_limit else 100000 # Step 1: vikingdb recall candidate files try: diff --git a/openviking/sync_client.py b/openviking/sync_client.py index a40e4400b7..a7d60e0720 100644 --- a/openviking/sync_client.py +++ b/openviking/sync_client.py @@ -463,7 +463,6 @@ def grep( node_limit: Optional[int] = None, exclude_uri: Optional[str] = None, level_limit: int = 5, - remote_return_limit: int = 0, ) -> Dict: """Content search""" return run_async( @@ -474,7 +473,6 @@ def grep( node_limit, exclude_uri, level_limit, - remote_return_limit, ) ) diff --git a/openviking_cli/client/base.py b/openviking_cli/client/base.py index d37ca3163b..7b844cb5c9 100644 --- a/openviking_cli/client/base.py +++ b/openviking_cli/client/base.py @@ -219,7 +219,6 @@ async def grep( exclude_uri: Optional[str] = None, node_limit: Optional[int] = None, level_limit: int = 5, - remote_return_limit: int = 0, ) -> Dict[str, Any]: """Content search with pattern.""" ... diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py index 610002a3f6..b0b57206b5 100644 --- a/openviking_cli/client/http.py +++ b/openviking_cli/client/http.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: AGPL-3.0 """Compatibility shim for the legacy HTTP client import path.""" - from openviking_cli.client._http_compat import ERROR_CODE_TO_EXCEPTION, AsyncHTTPClient __all__ = ["AsyncHTTPClient", "ERROR_CODE_TO_EXCEPTION"] diff --git a/tests/server/test_api_code.py b/tests/server/test_api_code.py index 25aabefc01..9dabebabb3 100644 --- a/tests/server/test_api_code.py +++ b/tests/server/test_api_code.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: AGPL-3.0 """Tests for /api/v1/code/* endpoints.""" - from openviking_cli.exceptions import PermissionDeniedError PY_SAMPLE = '''"""Module top doc.""" diff --git a/tests/storage/test_viking_fs_grep.py b/tests/storage/test_viking_fs_grep.py index 89b98bad2d..11c7f10e6d 100644 --- a/tests/storage/test_viking_fs_grep.py +++ 
b/tests/storage/test_viking_fs_grep.py @@ -71,7 +71,6 @@ async def test_grep_vikingdb_auto_remote_limit_uses_five_times_node_limit( case_insensitive=False, node_limit=node_limit, level_limit=10, - remote_return_limit=0, ctx=None, ) diff --git a/uv.lock b/uv.lock index fd1d9c03a1..fda63669c8 100644 --- a/uv.lock +++ b/uv.lock @@ -911,62 +911,59 @@ wheels = [ [[package]] name = "cryptography" -version = "46.0.5" +version = "49.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/60/04/ee2a9e8542e4fa2773b81771ff8349ff19cdd56b7258a0cc442639052edb/cryptography-46.0.5.tar.gz", hash = "sha256:abace499247268e3757271b2f1e244b36b06f8515cf27c4d49468fc9eb16e93d", size = 750064, upload-time = "2026-02-10T19:18:38.255Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f7/81/b0bb27f2ba931a65409c6b8a8b358a7f03c0e46eceacddff55f7c84b1f3b/cryptography-46.0.5-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:351695ada9ea9618b3500b490ad54c739860883df6c1f555e088eaf25b1bbaad", size = 7176289, upload-time = "2026-02-10T19:17:08.274Z" }, - { url = "https://files.pythonhosted.org/packages/ff/9e/6b4397a3e3d15123de3b1806ef342522393d50736c13b20ec4c9ea6693a6/cryptography-46.0.5-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c18ff11e86df2e28854939acde2d003f7984f721eba450b56a200ad90eeb0e6b", size = 4275637, upload-time = "2026-02-10T19:17:10.53Z" }, - { url = "https://files.pythonhosted.org/packages/63/e7/471ab61099a3920b0c77852ea3f0ea611c9702f651600397ac567848b897/cryptography-46.0.5-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d7e3d356b8cd4ea5aff04f129d5f66ebdc7b6f8eae802b93739ed520c47c79b", size = 4424742, upload-time = "2026-02-10T19:17:12.388Z" }, - { url = 
"https://files.pythonhosted.org/packages/37/53/a18500f270342d66bf7e4d9f091114e31e5ee9e7375a5aba2e85a91e0044/cryptography-46.0.5-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:50bfb6925eff619c9c023b967d5b77a54e04256c4281b0e21336a130cd7fc263", size = 4277528, upload-time = "2026-02-10T19:17:13.853Z" }, - { url = "https://files.pythonhosted.org/packages/22/29/c2e812ebc38c57b40e7c583895e73c8c5adb4d1e4a0cc4c5a4fdab2b1acc/cryptography-46.0.5-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:803812e111e75d1aa73690d2facc295eaefd4439be1023fefc4995eaea2af90d", size = 4947993, upload-time = "2026-02-10T19:17:15.618Z" }, - { url = "https://files.pythonhosted.org/packages/6b/e7/237155ae19a9023de7e30ec64e5d99a9431a567407ac21170a046d22a5a3/cryptography-46.0.5-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ee190460e2fbe447175cda91b88b84ae8322a104fc27766ad09428754a618ed", size = 4456855, upload-time = "2026-02-10T19:17:17.221Z" }, - { url = "https://files.pythonhosted.org/packages/2d/87/fc628a7ad85b81206738abbd213b07702bcbdada1dd43f72236ef3cffbb5/cryptography-46.0.5-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:f145bba11b878005c496e93e257c1e88f154d278d2638e6450d17e0f31e558d2", size = 3984635, upload-time = "2026-02-10T19:17:18.792Z" }, - { url = "https://files.pythonhosted.org/packages/84/29/65b55622bde135aedf4565dc509d99b560ee4095e56989e815f8fd2aa910/cryptography-46.0.5-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:e9251e3be159d1020c4030bd2e5f84d6a43fe54b6c19c12f51cde9542a2817b2", size = 4277038, upload-time = "2026-02-10T19:17:20.256Z" }, - { url = "https://files.pythonhosted.org/packages/bc/36/45e76c68d7311432741faf1fbf7fac8a196a0a735ca21f504c75d37e2558/cryptography-46.0.5-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:47fb8a66058b80e509c47118ef8a75d14c455e81ac369050f20ba0d23e77fee0", size = 4912181, upload-time = "2026-02-10T19:17:21.825Z" }, - { url = 
"https://files.pythonhosted.org/packages/6d/1a/c1ba8fead184d6e3d5afcf03d569acac5ad063f3ac9fb7258af158f7e378/cryptography-46.0.5-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:4c3341037c136030cb46e4b1e17b7418ea4cbd9dd207e4a6f3b2b24e0d4ac731", size = 4456482, upload-time = "2026-02-10T19:17:25.133Z" }, - { url = "https://files.pythonhosted.org/packages/f9/e5/3fb22e37f66827ced3b902cf895e6a6bc1d095b5b26be26bd13c441fdf19/cryptography-46.0.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:890bcb4abd5a2d3f852196437129eb3667d62630333aacc13dfd470fad3aaa82", size = 4405497, upload-time = "2026-02-10T19:17:26.66Z" }, - { url = "https://files.pythonhosted.org/packages/1a/df/9d58bb32b1121a8a2f27383fabae4d63080c7ca60b9b5c88be742be04ee7/cryptography-46.0.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:80a8d7bfdf38f87ca30a5391c0c9ce4ed2926918e017c29ddf643d0ed2778ea1", size = 4667819, upload-time = "2026-02-10T19:17:28.569Z" }, - { url = "https://files.pythonhosted.org/packages/ea/ed/325d2a490c5e94038cdb0117da9397ece1f11201f425c4e9c57fe5b9f08b/cryptography-46.0.5-cp311-abi3-win32.whl", hash = "sha256:60ee7e19e95104d4c03871d7d7dfb3d22ef8a9b9c6778c94e1c8fcc8365afd48", size = 3028230, upload-time = "2026-02-10T19:17:30.518Z" }, - { url = "https://files.pythonhosted.org/packages/e9/5a/ac0f49e48063ab4255d9e3b79f5def51697fce1a95ea1370f03dc9db76f6/cryptography-46.0.5-cp311-abi3-win_amd64.whl", hash = "sha256:38946c54b16c885c72c4f59846be9743d699eee2b69b6988e0a00a01f46a61a4", size = 3480909, upload-time = "2026-02-10T19:17:32.083Z" }, - { url = "https://files.pythonhosted.org/packages/00/13/3d278bfa7a15a96b9dc22db5a12ad1e48a9eb3d40e1827ef66a5df75d0d0/cryptography-46.0.5-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:94a76daa32eb78d61339aff7952ea819b1734b46f73646a07decb40e5b3448e2", size = 7119287, upload-time = "2026-02-10T19:17:33.801Z" }, - { url = 
"https://files.pythonhosted.org/packages/67/c8/581a6702e14f0898a0848105cbefd20c058099e2c2d22ef4e476dfec75d7/cryptography-46.0.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5be7bf2fb40769e05739dd0046e7b26f9d4670badc7b032d6ce4db64dddc0678", size = 4265728, upload-time = "2026-02-10T19:17:35.569Z" }, - { url = "https://files.pythonhosted.org/packages/dd/4a/ba1a65ce8fc65435e5a849558379896c957870dd64fecea97b1ad5f46a37/cryptography-46.0.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe346b143ff9685e40192a4960938545c699054ba11d4f9029f94751e3f71d87", size = 4408287, upload-time = "2026-02-10T19:17:36.938Z" }, - { url = "https://files.pythonhosted.org/packages/f8/67/8ffdbf7b65ed1ac224d1c2df3943553766914a8ca718747ee3871da6107e/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:c69fd885df7d089548a42d5ec05be26050ebcd2283d89b3d30676eb32ff87dee", size = 4270291, upload-time = "2026-02-10T19:17:38.748Z" }, - { url = "https://files.pythonhosted.org/packages/f8/e5/f52377ee93bc2f2bba55a41a886fd208c15276ffbd2569f2ddc89d50e2c5/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:8293f3dea7fc929ef7240796ba231413afa7b68ce38fd21da2995549f5961981", size = 4927539, upload-time = "2026-02-10T19:17:40.241Z" }, - { url = "https://files.pythonhosted.org/packages/3b/02/cfe39181b02419bbbbcf3abdd16c1c5c8541f03ca8bda240debc467d5a12/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:1abfdb89b41c3be0365328a410baa9df3ff8a9110fb75e7b52e66803ddabc9a9", size = 4442199, upload-time = "2026-02-10T19:17:41.789Z" }, - { url = "https://files.pythonhosted.org/packages/c0/96/2fcaeb4873e536cf71421a388a6c11b5bc846e986b2b069c79363dc1648e/cryptography-46.0.5-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:d66e421495fdb797610a08f43b05269e0a5ea7f5e652a89bfd5a7d3c1dee3648", size = 3960131, upload-time = "2026-02-10T19:17:43.379Z" }, - { url = 
"https://files.pythonhosted.org/packages/d8/d2/b27631f401ddd644e94c5cf33c9a4069f72011821cf3dc7309546b0642a0/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:4e817a8920bfbcff8940ecfd60f23d01836408242b30f1a708d93198393a80b4", size = 4270072, upload-time = "2026-02-10T19:17:45.481Z" }, - { url = "https://files.pythonhosted.org/packages/f4/a7/60d32b0370dae0b4ebe55ffa10e8599a2a59935b5ece1b9f06edb73abdeb/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:68f68d13f2e1cb95163fa3b4db4bf9a159a418f5f6e7242564fc75fcae667fd0", size = 4892170, upload-time = "2026-02-10T19:17:46.997Z" }, - { url = "https://files.pythonhosted.org/packages/d2/b9/cf73ddf8ef1164330eb0b199a589103c363afa0cf794218c24d524a58eab/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:a3d1fae9863299076f05cb8a778c467578262fae09f9dc0ee9b12eb4268ce663", size = 4441741, upload-time = "2026-02-10T19:17:48.661Z" }, - { url = "https://files.pythonhosted.org/packages/5f/eb/eee00b28c84c726fe8fa0158c65afe312d9c3b78d9d01daf700f1f6e37ff/cryptography-46.0.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c4143987a42a2397f2fc3b4d7e3a7d313fbe684f67ff443999e803dd75a76826", size = 4396728, upload-time = "2026-02-10T19:17:50.058Z" }, - { url = "https://files.pythonhosted.org/packages/65/f4/6bc1a9ed5aef7145045114b75b77c2a8261b4d38717bd8dea111a63c3442/cryptography-46.0.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7d731d4b107030987fd61a7f8ab512b25b53cef8f233a97379ede116f30eb67d", size = 4652001, upload-time = "2026-02-10T19:17:51.54Z" }, - { url = "https://files.pythonhosted.org/packages/86/ef/5d00ef966ddd71ac2e6951d278884a84a40ffbd88948ef0e294b214ae9e4/cryptography-46.0.5-cp314-cp314t-win32.whl", hash = "sha256:c3bcce8521d785d510b2aad26ae2c966092b7daa8f45dd8f44734a104dc0bc1a", size = 3003637, upload-time = "2026-02-10T19:17:52.997Z" }, - { url = 
"https://files.pythonhosted.org/packages/b7/57/f3f4160123da6d098db78350fdfd9705057aad21de7388eacb2401dceab9/cryptography-46.0.5-cp314-cp314t-win_amd64.whl", hash = "sha256:4d8ae8659ab18c65ced284993c2265910f6c9e650189d4e3f68445ef82a810e4", size = 3469487, upload-time = "2026-02-10T19:17:54.549Z" }, - { url = "https://files.pythonhosted.org/packages/e2/fa/a66aa722105ad6a458bebd64086ca2b72cdd361fed31763d20390f6f1389/cryptography-46.0.5-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:4108d4c09fbbf2789d0c926eb4152ae1760d5a2d97612b92d508d96c861e4d31", size = 7170514, upload-time = "2026-02-10T19:17:56.267Z" }, - { url = "https://files.pythonhosted.org/packages/0f/04/c85bdeab78c8bc77b701bf0d9bdcf514c044e18a46dcff330df5448631b0/cryptography-46.0.5-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1f30a86d2757199cb2d56e48cce14deddf1f9c95f1ef1b64ee91ea43fe2e18", size = 4275349, upload-time = "2026-02-10T19:17:58.419Z" }, - { url = "https://files.pythonhosted.org/packages/5c/32/9b87132a2f91ee7f5223b091dc963055503e9b442c98fc0b8a5ca765fab0/cryptography-46.0.5-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:039917b0dc418bb9f6edce8a906572d69e74bd330b0b3fea4f79dab7f8ddd235", size = 4420667, upload-time = "2026-02-10T19:18:00.619Z" }, - { url = "https://files.pythonhosted.org/packages/a1/a6/a7cb7010bec4b7c5692ca6f024150371b295ee1c108bdc1c400e4c44562b/cryptography-46.0.5-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ba2a27ff02f48193fc4daeadf8ad2590516fa3d0adeeb34336b96f7fa64c1e3a", size = 4276980, upload-time = "2026-02-10T19:18:02.379Z" }, - { url = "https://files.pythonhosted.org/packages/8e/7c/c4f45e0eeff9b91e3f12dbd0e165fcf2a38847288fcfd889deea99fb7b6d/cryptography-46.0.5-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:61aa400dce22cb001a98014f647dc21cda08f7915ceb95df0c9eaf84b4b6af76", size = 4939143, upload-time = "2026-02-10T19:18:03.964Z" }, - { url = 
"https://files.pythonhosted.org/packages/37/19/e1b8f964a834eddb44fa1b9a9976f4e414cbb7aa62809b6760c8803d22d1/cryptography-46.0.5-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ce58ba46e1bc2aac4f7d9290223cead56743fa6ab94a5d53292ffaac6a91614", size = 4453674, upload-time = "2026-02-10T19:18:05.588Z" }, - { url = "https://files.pythonhosted.org/packages/db/ed/db15d3956f65264ca204625597c410d420e26530c4e2943e05a0d2f24d51/cryptography-46.0.5-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:420d0e909050490d04359e7fdb5ed7e667ca5c3c402b809ae2563d7e66a92229", size = 3978801, upload-time = "2026-02-10T19:18:07.167Z" }, - { url = "https://files.pythonhosted.org/packages/41/e2/df40a31d82df0a70a0daf69791f91dbb70e47644c58581d654879b382d11/cryptography-46.0.5-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:582f5fcd2afa31622f317f80426a027f30dc792e9c80ffee87b993200ea115f1", size = 4276755, upload-time = "2026-02-10T19:18:09.813Z" }, - { url = "https://files.pythonhosted.org/packages/33/45/726809d1176959f4a896b86907b98ff4391a8aa29c0aaaf9450a8a10630e/cryptography-46.0.5-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:bfd56bb4b37ed4f330b82402f6f435845a5f5648edf1ad497da51a8452d5d62d", size = 4901539, upload-time = "2026-02-10T19:18:11.263Z" }, - { url = "https://files.pythonhosted.org/packages/99/0f/a3076874e9c88ecb2ecc31382f6e7c21b428ede6f55aafa1aa272613e3cd/cryptography-46.0.5-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a3d507bb6a513ca96ba84443226af944b0f7f47dcc9a399d110cd6146481d24c", size = 4452794, upload-time = "2026-02-10T19:18:12.914Z" }, - { url = "https://files.pythonhosted.org/packages/02/ef/ffeb542d3683d24194a38f66ca17c0a4b8bf10631feef44a7ef64e631b1a/cryptography-46.0.5-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9f16fbdf4da055efb21c22d81b89f155f02ba420558db21288b3d0035bafd5f4", size = 4404160, upload-time = "2026-02-10T19:18:14.375Z" }, - { url = 
"https://files.pythonhosted.org/packages/96/93/682d2b43c1d5f1406ed048f377c0fc9fc8f7b0447a478d5c65ab3d3a66eb/cryptography-46.0.5-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ced80795227d70549a411a4ab66e8ce307899fad2220ce5ab2f296e687eacde9", size = 4667123, upload-time = "2026-02-10T19:18:15.886Z" }, - { url = "https://files.pythonhosted.org/packages/45/2d/9c5f2926cb5300a8eefc3f4f0b3f3df39db7f7ce40c8365444c49363cbda/cryptography-46.0.5-cp38-abi3-win32.whl", hash = "sha256:02f547fce831f5096c9a567fd41bc12ca8f11df260959ecc7c3202555cc47a72", size = 3010220, upload-time = "2026-02-10T19:18:17.361Z" }, - { url = "https://files.pythonhosted.org/packages/48/ef/0c2f4a8e31018a986949d34a01115dd057bf536905dca38897bacd21fac3/cryptography-46.0.5-cp38-abi3-win_amd64.whl", hash = "sha256:556e106ee01aa13484ce9b0239bca667be5004efb0aabbed28d353df86445595", size = 3467050, upload-time = "2026-02-10T19:18:18.899Z" }, - { url = "https://files.pythonhosted.org/packages/eb/dd/2d9fdb07cebdf3d51179730afb7d5e576153c6744c3ff8fded23030c204e/cryptography-46.0.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:3b4995dc971c9fb83c25aa44cf45f02ba86f71ee600d81091c2f0cbae116b06c", size = 3476964, upload-time = "2026-02-10T19:18:20.687Z" }, - { url = "https://files.pythonhosted.org/packages/e9/6f/6cc6cc9955caa6eaf83660b0da2b077c7fe8ff9950a3c5e45d605038d439/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bc84e875994c3b445871ea7181d424588171efec3e185dced958dad9e001950a", size = 4218321, upload-time = "2026-02-10T19:18:22.349Z" }, - { url = "https://files.pythonhosted.org/packages/3e/5d/c4da701939eeee699566a6c1367427ab91a8b7088cc2328c09dbee940415/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:2ae6971afd6246710480e3f15824ed3029a60fc16991db250034efd0b9fb4356", size = 4381786, upload-time = "2026-02-10T19:18:24.529Z" }, - { url = 
"https://files.pythonhosted.org/packages/ac/97/a538654732974a94ff96c1db621fa464f455c02d4bb7d2652f4edc21d600/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:d861ee9e76ace6cf36a6a89b959ec08e7bc2493ee39d07ffe5acb23ef46d27da", size = 4217990, upload-time = "2026-02-10T19:18:25.957Z" }, - { url = "https://files.pythonhosted.org/packages/ae/11/7e500d2dd3ba891197b9efd2da5454b74336d64a7cc419aa7327ab74e5f6/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:2b7a67c9cd56372f3249b39699f2ad479f6991e62ea15800973b956f4b73e257", size = 4381252, upload-time = "2026-02-10T19:18:27.496Z" }, - { url = "https://files.pythonhosted.org/packages/bc/58/6b3d24e6b9bc474a2dcdee65dfd1f008867015408a271562e4b690561a4d/cryptography-46.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8456928655f856c6e1533ff59d5be76578a7157224dbd9ce6872f25055ab9ab7", size = 3407605, upload-time = "2026-02-10T19:18:29.233Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/1f/99/d1c90d6041656cc6ee229dc99cd67fd0cd5aec3c5f7d72fffc27cc750054/cryptography-49.0.0.tar.gz", hash = "sha256:f89660a348f4f78a92366240a61404e337586ef7f5909a2fef59ca88ef505493", size = 854345, upload-time = "2026-06-12T20:02:30.512Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/22/adf66990e63584a68dfb50c24f48a125c07b1699899381c8151e63ed458c/cryptography-49.0.0-cp311-abi3-macosx_11_0_arm64.whl", hash = "sha256:966fe0e9c67490071f14c0d2b1cb2dfb3023c5ce39457343931415f08382f2db", size = 4032100, upload-time = "2026-06-12T20:02:32.143Z" }, + { url = "https://files.pythonhosted.org/packages/09/41/3797cfaf69cae04a13ee78ebd83f0678d9c02b4779d21ce24445326f1a69/cryptography-49.0.0-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:36d1709f992593689b45bda411498d62c6e365f2ca00b84657d4dadd24de16db", size = 4692978, upload-time = "2026-06-12T20:01:21.305Z" }, + { url = 
"https://files.pythonhosted.org/packages/e6/8b/43011f7ebe515a8aa20d61f290a326cd890c2e738e16e59eaff8d9c3a412/cryptography-49.0.0-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0e959b578856a3924bc0cbb710fc12c387b9412a951389f3ca61704a9e25f325", size = 4716422, upload-time = "2026-06-12T20:01:48.566Z" }, + { url = "https://files.pythonhosted.org/packages/4a/91/01ce7303a4579e6d3a6abef01bd322848e9ea7a219adcabc5048b9033571/cryptography-49.0.0-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:53ecee2e23f7169b6117e99fc8a944e5e50f79e69758a83b52a00cb98ab2b2d2", size = 4700503, upload-time = "2026-06-12T20:02:47.091Z" }, + { url = "https://files.pythonhosted.org/packages/62/99/a2c95cf8293f07491e9e27c20cc4dcd18176d944e674679adeb1d0173fd6/cryptography-49.0.0-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:2eda353d8a27bcbcaa4cbed18994a74ab4d19a2ca897db188ea269ab9b71419b", size = 5309779, upload-time = "2026-06-12T20:02:08.987Z" }, + { url = "https://files.pythonhosted.org/packages/20/2c/0622f20ff02b2ef32558733443805dc82fd4c275be01b2d19d14676f3a1b/cryptography-49.0.0-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2afe9051da7ae7bd5905da5a949280c7d2bb75682e188f650a9d0f2756b834c6", size = 4749683, upload-time = "2026-06-12T20:02:03.335Z" }, + { url = "https://files.pythonhosted.org/packages/a3/5b/c5246635d5fd3b64e0d45ae10e99fd32fe9676a79915ccfe5a61ba9af1a5/cryptography-49.0.0-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:0b82e28ee398a386f0807bba7884d30f25218855690f45115831bcce5d90822c", size = 4337874, upload-time = "2026-06-12T20:02:54.323Z" }, + { url = "https://files.pythonhosted.org/packages/6d/88/05563c7fe2e914e87d1a536d06fe83e66b4e1d95cb593e05aea375531da8/cryptography-49.0.0-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:ccac2bfebc306b862133e3bb71f3f6ee8bb525240089b2d952e4144b3a6d5da7", size = 4700283, upload-time = "2026-06-12T20:01:34.822Z" }, + { url = 
"https://files.pythonhosted.org/packages/c4/b6/d7696e4e890d6ae1469935164c9e5215c557671cb78d6e3f458ccceaa632/cryptography-49.0.0-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:d0527ce944105f257f605a827d6ebead966c752038b6e8656abb9c5edee6fc68", size = 5265844, upload-time = "2026-06-12T20:01:24.09Z" }, + { url = "https://files.pythonhosted.org/packages/a9/3c/f3ad17eecc1a57b0ba236dc01f90e783c51f4a2f35f64777cc4f47a184b2/cryptography-49.0.0-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:cbc77da8c523d5abd028635ba850a6966fcee2c82e2bf65a41d1d8afe0f98be9", size = 4749290, upload-time = "2026-06-12T20:01:30.848Z" }, + { url = "https://files.pythonhosted.org/packages/4f/01/339573cf1023163a400b0b5d16f6d507de413b9f60be6fd1b77feeaf6737/cryptography-49.0.0-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b87e65d263b3e5d3bb92a57e2a6638e2f31110fa7aa890c7b2dbba42248d0a3f", size = 4834612, upload-time = "2026-06-12T20:01:29.246Z" }, + { url = "https://files.pythonhosted.org/packages/71/fd/577302e213a1be9468f92d1afef66fcf1ef83d516819d9992ca547f592bd/cryptography-49.0.0-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:66ec79c3904820572d7e987abdf304281f141d37ad9a489b8e97066e7b9b6459", size = 4980804, upload-time = "2026-06-12T20:01:42.853Z" }, + { url = "https://files.pythonhosted.org/packages/1f/09/f42b1d190c5ba75f72062a387f8030d1d75f6ab035788f1d9c4b01de6525/cryptography-49.0.0-cp311-abi3-win_amd64.whl", hash = "sha256:e5dfc1e64de5677cec922ffa8da89c546d0415bf6efdf081842e5d44c84e1f0e", size = 3810026, upload-time = "2026-06-12T20:02:39.262Z" }, + { url = "https://files.pythonhosted.org/packages/ec/9e/db72b3ae7fc9cfad53e630e56c6ae83b9b6ff0bf3718ffb8012d20b3aabf/cryptography-49.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:73a205dce83953d131a4aa1e0fd917a2fd1c5b1eef251e9d7152efefcbf5caf7", size = 4013892, upload-time = "2026-06-12T20:02:10.735Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/12/c48a424f38db03027be9f7ed5c7dc5de9933dbee992865f98b13727a009d/cryptography-49.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:196ecd6a36e4e9aa10270393bb98d8df88fccee0bf1e5128b91ae4eb4375896d", size = 4678835, upload-time = "2026-06-12T20:02:48.743Z" }, + { url = "https://files.pythonhosted.org/packages/68/28/8a3ad4653662c93fc44dc4e5d8fd374c25c42e07b34bbfbadf49cf57a5a8/cryptography-49.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7abcee80084cda3f7691f3eb1ce480d8df49cec637b429aa35986c1de71738aa", size = 4697239, upload-time = "2026-06-12T20:02:56.03Z" }, + { url = "https://files.pythonhosted.org/packages/a8/b2/2193fc74f81aee4f9b62733133b73b5176718932ed8f2e4b03fa040480a6/cryptography-49.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:4ae387c9cb68ea569ca17e490d66d8142b81c3cc814bf179974b7d146e490bbb", size = 4685593, upload-time = "2026-06-12T20:02:50.666Z" }, + { url = "https://files.pythonhosted.org/packages/47/f1/1d3eaa243bfc5de4a187b22aa8c048b3e4980bfbe830ac46e6bac2e66947/cryptography-49.0.0-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:f37d847238971164fdbc68ade6f6574aecc9c0af714190e2083429ff68f4ce9d", size = 5289961, upload-time = "2026-06-12T20:01:46.468Z" }, + { url = "https://files.pythonhosted.org/packages/58/39/2d51306721330c486495853eda1c567880ff036de15a14c4b74f399934af/cryptography-49.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:c2bc30226390d60ea19d9f82b19db005fe0452154a23c1c410c12ea801e43561", size = 4731145, upload-time = "2026-06-12T20:02:16.832Z" }, + { url = "https://files.pythonhosted.org/packages/17/50/983e838c7fd0d87fd8c969bcdd328edaf5f756e38df5281637424c155873/cryptography-49.0.0-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:07cab27cc7b7e0fd28e5e26bb9eeedde5c135c868b46de4a27845abe94af6122", size = 4321719, upload-time = "2026-06-12T20:02:52.611Z" }, + { url = 
"https://files.pythonhosted.org/packages/a7/f5/8f571d7e27c55bce9f76f026143bcb1e040a4233149ecca0bea5fa5dd5f7/cryptography-49.0.0-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:b20133d204d2bb56ba047642199603876c872026ca53e79c35b83772ab2cc505", size = 4685209, upload-time = "2026-06-12T20:02:07.282Z" }, + { url = "https://files.pythonhosted.org/packages/e7/84/0e27016a6fc5a0886f797018b26aa42f40c09a82332bff77822a451deaaa/cryptography-49.0.0-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:b970c6da94d5bb18629db453d14f2a1300f6bf59b61e9b82377931ef95504866", size = 5246285, upload-time = "2026-06-12T20:01:32.439Z" }, + { url = "https://files.pythonhosted.org/packages/11/2d/5e1fb307cb5931881516b464c98774b3f2c36b5d4bb9a2830253cf553cad/cryptography-49.0.0-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:d8ecde755e2e91bf773fc94e8c9d730cd7f2007004cb492263a794ec3899a1c8", size = 4730441, upload-time = "2026-06-12T20:02:01.469Z" }, + { url = "https://files.pythonhosted.org/packages/e4/c0/bff5a02ee731d207d6a1ed51732549d8c53d2bc8da1d10ec6f2844201d68/cryptography-49.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e3fb64c420688e5319ae25113a354015abbd8dffbfbc41781a1ea66fc7622ac3", size = 4815869, upload-time = "2026-06-12T20:01:36.574Z" }, + { url = "https://files.pythonhosted.org/packages/b9/26/814681d14248d95d73d5c3eea0c39a94eb8302df966f670a2c60de90974b/cryptography-49.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:32703d93296f5c1f4b53349ad3a250c2cae0fdecd3a3dd5d47e616d8d616af27", size = 4960948, upload-time = "2026-06-12T20:02:18.688Z" }, + { url = "https://files.pythonhosted.org/packages/4c/fe/93ecac273d3738939d023612ad12cca9a3740a5345d69fda04134c43fd96/cryptography-49.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:33cd0565932807baddb67b96dbee92f2c374b5c89dee09fd74079aeb8c8dba61", size = 3799153, upload-time = "2026-06-12T20:01:39.059Z" }, + { url = 
"https://files.pythonhosted.org/packages/19/2a/5bb823f5bedcf80718cea7fbc95ec5515cca3769633c4b01a32be7f30e7c/cryptography-49.0.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:ec5e529fb80935c94fe7b729f9972b50e351a0e6b50aa294fd5cabb109fcc29a", size = 4025947, upload-time = "2026-06-12T20:01:25.745Z" }, + { url = "https://files.pythonhosted.org/packages/3d/df/40577043ca124e17012f408ddddaeb213b856336ac82ddb3bc915f39e29f/cryptography-49.0.0-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f78ff2c9ed8dc2d036b0f4d640e22522213d047c1b14e61205a7e55c80a494d4", size = 4692429, upload-time = "2026-06-12T20:01:53.628Z" }, + { url = "https://files.pythonhosted.org/packages/2c/99/2d13299eb3dd27b02dcfaafcc91d6b5cb3329f7cbd6d8f51921acd566c1a/cryptography-49.0.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:35b151772baff2c74cba7fa290ceaff4c3b11c0c881eb93eb5dbc05a7cfbba18", size = 4700968, upload-time = "2026-06-12T20:02:45.383Z" }, + { url = "https://files.pythonhosted.org/packages/a5/4d/9c0cd02f95e2602dd5e563da149ee0830abef3537be8b34dc56281ebe27a/cryptography-49.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0f21641cf4b30fca7aee061ced0ec7ad7b073518088b7c9969a297c0ae796c69", size = 4697758, upload-time = "2026-06-12T20:01:41.13Z" }, + { url = "https://files.pythonhosted.org/packages/24/01/186c825898477d77e2324d5360fefe622ff1d8d1963ec0554e2cada8ec77/cryptography-49.0.0-cp39-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:9e82dcc8e56052715fb18b2429e3bca4823b1629136a2084fc45a9a5cecb9b64", size = 5298863, upload-time = "2026-06-12T20:02:24.579Z" }, + { url = "https://files.pythonhosted.org/packages/b8/7b/62cbbab75d0659865bf0273790031544a0b16c8072d258f9428dcd8190dc/cryptography-49.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:6f2debedf9ca60cf1d5bd466475638af5130f89965605cd818484d19987d3a21", size = 4735983, upload-time = "2026-06-12T20:01:50.14Z" }, + { url = 
"https://files.pythonhosted.org/packages/6c/72/3e798c064bc39e471008075d0f9bc9daf77a80879c092e4a8e170c585ed4/cryptography-49.0.0-cp39-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:8c25ceb16df5b9435f3f6a9829204985b0e0cbee3b48aacd432c7d2c850b44d9", size = 4334173, upload-time = "2026-06-12T20:01:44.743Z" }, + { url = "https://files.pythonhosted.org/packages/f0/ee/6fca21d1ac73e06f8bef71940abfd4d2f6472b4bca284d770f32bd4086f6/cryptography-49.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:28d8b15e6275f12c8a207dc309dfa957903c927d08d0cc937ee3f63f200693cc", size = 4697298, upload-time = "2026-06-12T20:02:20.918Z" }, + { url = "https://files.pythonhosted.org/packages/67/d0/a5fcd3515f0bae49a7b6d0413cc1bdccdcc1fc0047037a0d480642cdc5d6/cryptography-49.0.0-cp39-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:6fc361c34fb6aac015ce19435876635e5c6d21db31998b0920f675f131e043b8", size = 5254338, upload-time = "2026-06-12T20:02:22.737Z" }, + { url = "https://files.pythonhosted.org/packages/a0/84/84fe36f19caf857d61cb7fc9c63035a47ffabd84ea12d1d393148efa3615/cryptography-49.0.0-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:2400ef9c9e2299a25614eb1dea3db54a69b1349efd043bfac9c67630d136df36", size = 4735650, upload-time = "2026-06-12T20:02:41.389Z" }, + { url = "https://files.pythonhosted.org/packages/6c/a0/db537264e234f7273a73ec020873d6d6b39dfd8a53db78b550ca8320440e/cryptography-49.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:67e1d20ad9ef3a563c59ef22e7a8a0b8210bd26604369ea4a30a7c66aefe504e", size = 4834820, upload-time = "2026-06-12T20:01:51.847Z" }, + { url = "https://files.pythonhosted.org/packages/93/77/8df9eb486495979bccecd1062e2eaf435250e84437040295b57d09048b0b/cryptography-49.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:42b0684e0e40cf26122427802486f6d93aea593612603a94fbf260c7eb1e9c1b", size = 4967968, upload-time = "2026-06-12T20:02:12.524Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/e6/f60198ea8d9dfa15fff9ed4ca02ce362f6eadd9ba757dcc50634c4257b63/cryptography-49.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:026ac7423e6fa66872d3bf889be5974507da3944f866f704fa200eadacd00001", size = 3785547, upload-time = "2026-06-12T20:02:26.847Z" }, + { url = "https://files.pythonhosted.org/packages/63/d3/4a83af35d65e3fad632c926fad684c193ea4398569ccb0bbbc7fe8f5dc9a/cryptography-49.0.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:fc1e275c2f1d97b1a6450b8b0ea3ebfa6e087a611c2b26cb2404d48588abab7b", size = 3993685, upload-time = "2026-06-12T20:02:14.883Z" }, + { url = "https://files.pythonhosted.org/packages/d6/a7/f9dac0ab7f80368c56993a7bf638ef9935f825c91902798481fac0898138/cryptography-49.0.0-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c83782480a4a9da4d0feb51950131ba32e12e70813848b3343f6e18c28a66838", size = 4676239, upload-time = "2026-06-12T20:02:28.793Z" }, + { url = "https://files.pythonhosted.org/packages/d7/70/2ba3769dd0ae167e2f33dfa9592d45db6ff9a61d62ca1a5b3d1bdd09068f/cryptography-49.0.0-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b39efa323140595abd3ecca8529d321ae50f55f3aa3ba9cc81ea56a6011953d5", size = 4715584, upload-time = "2026-06-12T20:01:27.495Z" }, + { url = "https://files.pythonhosted.org/packages/94/64/2923570ac1c0bd3a737aa366ac3abbbbde273042308b8cde95e2364a6e6a/cryptography-49.0.0-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:b47db11c2c3525083296069b98ac5221907455e989ae0c2e3008bde851921615", size = 4675885, upload-time = "2026-06-12T20:01:55.49Z" }, + { url = "https://files.pythonhosted.org/packages/ab/f8/614dc7e051418cfe53d55173c1e24c6b0085e89996fe90508c2fdf769aef/cryptography-49.0.0-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:084ef1af862eb07ec46d25f68689f2102a9fc0e05ce7b80f14f5fe51e4eef0f6", size = 4715449, upload-time = "2026-06-12T20:02:05.469Z" }, + { url = 
"https://files.pythonhosted.org/packages/aa/50/a9caea39ad19c431c1a3f8a31114df65b260cdfe67786b6c7e7c040c4c44/cryptography-49.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:be9fcb48a55f023493482827d4f459bd263cc20efde64f204b97c123201850c6", size = 3783731, upload-time = "2026-06-12T20:02:43.319Z" }, ] [[package]] @@ -1559,7 +1556,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/3f/9859f655d11901e7b2996c6e3d33e0caa9a1d4572c3bc61ed0faa64b2f4c/greenlet-3.3.2-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:9bc885b89709d901859cf95179ec9f6bb67a3d2bb1f0e88456461bd4b7f8fd0d", size = 277747, upload-time = "2026-02-20T20:16:21.325Z" }, { url = "https://files.pythonhosted.org/packages/fb/07/cb284a8b5c6498dbd7cba35d31380bb123d7dceaa7907f606c8ff5993cbf/greenlet-3.3.2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b568183cf65b94919be4438dc28416b234b678c608cafac8874dfeeb2a9bbe13", size = 579202, upload-time = "2026-02-20T20:47:28.955Z" }, { url = "https://files.pythonhosted.org/packages/ed/45/67922992b3a152f726163b19f890a85129a992f39607a2a53155de3448b8/greenlet-3.3.2-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:527fec58dc9f90efd594b9b700662ed3fb2493c2122067ac9c740d98080a620e", size = 590620, upload-time = "2026-02-20T20:55:55.581Z" }, - { url = "https://files.pythonhosted.org/packages/03/5f/6e2a7d80c353587751ef3d44bb947f0565ec008a2e0927821c007e96d3a7/greenlet-3.3.2-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:508c7f01f1791fbc8e011bd508f6794cb95397fdb198a46cb6635eb5b78d85a7", size = 602132, upload-time = "2026-02-20T21:02:43.261Z" }, { url = "https://files.pythonhosted.org/packages/ad/55/9f1ebb5a825215fadcc0f7d5073f6e79e3007e3282b14b22d6aba7ca6cb8/greenlet-3.3.2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ad0c8917dd42a819fe77e6bdfcb84e3379c0de956469301d9fd36427a1ca501f", size = 591729, upload-time = 
"2026-02-20T20:20:58.395Z" }, { url = "https://files.pythonhosted.org/packages/24/b4/21f5455773d37f94b866eb3cf5caed88d6cea6dd2c6e1f9c34f463cba3ec/greenlet-3.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:97245cc10e5515dbc8c3104b2928f7f02b6813002770cfaffaf9a6e0fc2b94ef", size = 1551946, upload-time = "2026-02-20T20:49:31.102Z" }, { url = "https://files.pythonhosted.org/packages/00/68/91f061a926abead128fe1a87f0b453ccf07368666bd59ffa46016627a930/greenlet-3.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8c1fdd7d1b309ff0da81d60a9688a8bd044ac4e18b250320a96fc68d31c209ca", size = 1618494, upload-time = "2026-02-20T20:21:06.541Z" }, @@ -1567,7 +1563,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f3/47/16400cb42d18d7a6bb46f0626852c1718612e35dcb0dffa16bbaffdf5dd2/greenlet-3.3.2-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:c56692189a7d1c7606cb794be0a8381470d95c57ce5be03fb3d0ef57c7853b86", size = 278890, upload-time = "2026-02-20T20:19:39.263Z" }, { url = "https://files.pythonhosted.org/packages/a3/90/42762b77a5b6aa96cd8c0e80612663d39211e8ae8a6cd47c7f1249a66262/greenlet-3.3.2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ebd458fa8285960f382841da585e02201b53a5ec2bac6b156fc623b5ce4499f", size = 581120, upload-time = "2026-02-20T20:47:30.161Z" }, { url = "https://files.pythonhosted.org/packages/bf/6f/f3d64f4fa0a9c7b5c5b3c810ff1df614540d5aa7d519261b53fba55d4df9/greenlet-3.3.2-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a443358b33c4ec7b05b79a7c8b466f5d275025e750298be7340f8fc63dff2a55", size = 594363, upload-time = "2026-02-20T20:55:56.965Z" }, - { url = "https://files.pythonhosted.org/packages/9c/8b/1430a04657735a3f23116c2e0d5eb10220928846e4537a938a41b350bed6/greenlet-3.3.2-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4375a58e49522698d3e70cc0b801c19433021b5c37686f7ce9c65b0d5c8677d2", size = 605046, upload-time = 
"2026-02-20T21:02:45.234Z" }, { url = "https://files.pythonhosted.org/packages/72/83/3e06a52aca8128bdd4dcd67e932b809e76a96ab8c232a8b025b2850264c5/greenlet-3.3.2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e2cd90d413acbf5e77ae41e5d3c9b3ac1d011a756d7284d7f3f2b806bbd6358", size = 594156, upload-time = "2026-02-20T20:20:59.955Z" }, { url = "https://files.pythonhosted.org/packages/70/79/0de5e62b873e08fe3cef7dbe84e5c4bc0e8ed0c7ff131bccb8405cd107c8/greenlet-3.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:442b6057453c8cb29b4fb36a2ac689382fc71112273726e2423f7f17dc73bf99", size = 1554649, upload-time = "2026-02-20T20:49:32.293Z" }, { url = "https://files.pythonhosted.org/packages/5a/00/32d30dee8389dc36d42170a9c66217757289e2afb0de59a3565260f38373/greenlet-3.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:45abe8eb6339518180d5a7fa47fa01945414d7cca5ecb745346fc6a87d2750be", size = 1619472, upload-time = "2026-02-20T20:21:07.966Z" }, @@ -1576,7 +1571,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ea/ab/1608e5a7578e62113506740b88066bf09888322a311cff602105e619bd87/greenlet-3.3.2-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:ac8d61d4343b799d1e526db579833d72f23759c71e07181c2d2944e429eb09cd", size = 280358, upload-time = "2026-02-20T20:17:43.971Z" }, { url = "https://files.pythonhosted.org/packages/a5/23/0eae412a4ade4e6623ff7626e38998cb9b11e9ff1ebacaa021e4e108ec15/greenlet-3.3.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ceec72030dae6ac0c8ed7591b96b70410a8be370b6a477b1dbc072856ad02bd", size = 601217, upload-time = "2026-02-20T20:47:31.462Z" }, { url = "https://files.pythonhosted.org/packages/f8/16/5b1678a9c07098ecb9ab2dd159fafaf12e963293e61ee8d10ecb55273e5e/greenlet-3.3.2-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a2a5be83a45ce6188c045bcc44b0ee037d6a518978de9a5d97438548b953a1ac", size = 611792, upload-time = 
"2026-02-20T20:55:58.423Z" }, - { url = "https://files.pythonhosted.org/packages/5c/c5/cc09412a29e43406eba18d61c70baa936e299bc27e074e2be3806ed29098/greenlet-3.3.2-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ae9e21c84035c490506c17002f5c8ab25f980205c3e61ddb3a2a2a2e6c411fcb", size = 626250, upload-time = "2026-02-20T21:02:46.596Z" }, { url = "https://files.pythonhosted.org/packages/50/1f/5155f55bd71cabd03765a4aac9ac446be129895271f73872c36ebd4b04b6/greenlet-3.3.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43e99d1749147ac21dde49b99c9abffcbc1e2d55c67501465ef0930d6e78e070", size = 613875, upload-time = "2026-02-20T20:21:01.102Z" }, { url = "https://files.pythonhosted.org/packages/fc/dd/845f249c3fcd69e32df80cdab059b4be8b766ef5830a3d0aa9d6cad55beb/greenlet-3.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4c956a19350e2c37f2c48b336a3afb4bff120b36076d9d7fb68cb44e05d95b79", size = 1571467, upload-time = "2026-02-20T20:49:33.495Z" }, { url = "https://files.pythonhosted.org/packages/2a/50/2649fe21fcc2b56659a452868e695634722a6655ba245d9f77f5656010bf/greenlet-3.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6c6f8ba97d17a1e7d664151284cb3315fc5f8353e75221ed4324f84eb162b395", size = 1640001, upload-time = "2026-02-20T20:21:09.154Z" }, @@ -1585,7 +1579,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ac/48/f8b875fa7dea7dd9b33245e37f065af59df6a25af2f9561efa8d822fde51/greenlet-3.3.2-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:aa6ac98bdfd716a749b84d4034486863fd81c3abde9aa3cf8eff9127981a4ae4", size = 279120, upload-time = "2026-02-20T20:19:01.9Z" }, { url = "https://files.pythonhosted.org/packages/49/8d/9771d03e7a8b1ee456511961e1b97a6d77ae1dea4a34a5b98eee706689d3/greenlet-3.3.2-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ab0c7e7901a00bc0a7284907273dc165b32e0d109a6713babd04471327ff7986", size = 603238, upload-time = "2026-02-20T20:47:32.873Z" 
}, { url = "https://files.pythonhosted.org/packages/59/0e/4223c2bbb63cd5c97f28ffb2a8aee71bdfb30b323c35d409450f51b91e3e/greenlet-3.3.2-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d248d8c23c67d2291ffd47af766e2a3aa9fa1c6703155c099feb11f526c63a92", size = 614219, upload-time = "2026-02-20T20:55:59.817Z" }, - { url = "https://files.pythonhosted.org/packages/94/2b/4d012a69759ac9d77210b8bfb128bc621125f5b20fc398bce3940d036b1c/greenlet-3.3.2-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ccd21bb86944ca9be6d967cf7691e658e43417782bce90b5d2faeda0ff78a7dd", size = 628268, upload-time = "2026-02-20T21:02:48.024Z" }, { url = "https://files.pythonhosted.org/packages/7a/34/259b28ea7a2a0c904b11cd36c79b8cef8019b26ee5dbe24e73b469dea347/greenlet-3.3.2-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b6997d360a4e6a4e936c0f9625b1c20416b8a0ea18a8e19cabbefc712e7397ab", size = 616774, upload-time = "2026-02-20T20:21:02.454Z" }, { url = "https://files.pythonhosted.org/packages/0a/03/996c2d1689d486a6e199cb0f1cf9e4aa940c500e01bdf201299d7d61fa69/greenlet-3.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:64970c33a50551c7c50491671265d8954046cb6e8e2999aacdd60e439b70418a", size = 1571277, upload-time = "2026-02-20T20:49:34.795Z" }, { url = "https://files.pythonhosted.org/packages/d9/c4/2570fc07f34a39f2caf0bf9f24b0a1a0a47bc2e8e465b2c2424821389dfc/greenlet-3.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1a9172f5bf6bd88e6ba5a84e0a68afeac9dc7b6b412b245dd64f52d83c81e55b", size = 1640455, upload-time = "2026-02-20T20:21:10.261Z" }, @@ -1594,7 +1587,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/ae/8bffcbd373b57a5992cd077cbe8858fff39110480a9d50697091faea6f39/greenlet-3.3.2-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:8d1658d7291f9859beed69a776c10822a0a799bc4bfe1bd4272bb60e62507dab", size = 279650, upload-time = "2026-02-20T20:18:00.783Z" }, { url = 
"https://files.pythonhosted.org/packages/d1/c0/45f93f348fa49abf32ac8439938726c480bd96b2a3c6f4d949ec0124b69f/greenlet-3.3.2-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18cb1b7337bca281915b3c5d5ae19f4e76d35e1df80f4ad3c1a7be91fadf1082", size = 650295, upload-time = "2026-02-20T20:47:34.036Z" }, { url = "https://files.pythonhosted.org/packages/b3/de/dd7589b3f2b8372069ab3e4763ea5329940fc7ad9dcd3e272a37516d7c9b/greenlet-3.3.2-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2e47408e8ce1c6f1ceea0dffcdf6ebb85cc09e55c7af407c99f1112016e45e9", size = 662163, upload-time = "2026-02-20T20:56:01.295Z" }, - { url = "https://files.pythonhosted.org/packages/cd/ac/85804f74f1ccea31ba518dcc8ee6f14c79f73fe36fa1beba38930806df09/greenlet-3.3.2-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e3cb43ce200f59483eb82949bf1835a99cf43d7571e900d7c8d5c62cdf25d2f9", size = 675371, upload-time = "2026-02-20T21:02:49.664Z" }, { url = "https://files.pythonhosted.org/packages/d2/d8/09bfa816572a4d83bccd6750df1926f79158b1c36c5f73786e26dbe4ee38/greenlet-3.3.2-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63d10328839d1973e5ba35e98cccbca71b232b14051fd957b6f8b6e8e80d0506", size = 664160, upload-time = "2026-02-20T20:21:04.015Z" }, { url = "https://files.pythonhosted.org/packages/48/cf/56832f0c8255d27f6c35d41b5ec91168d74ec721d85f01a12131eec6b93c/greenlet-3.3.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8e4ab3cfb02993c8cc248ea73d7dae6cec0253e9afa311c9b37e603ca9fad2ce", size = 1619181, upload-time = "2026-02-20T20:49:36.052Z" }, { url = "https://files.pythonhosted.org/packages/0a/23/b90b60a4aabb4cec0796e55f25ffbfb579a907c3898cd2905c8918acaa16/greenlet-3.3.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:94ad81f0fd3c0c0681a018a976e5c2bd2ca2d9d94895f23e7bb1af4e8af4e2d5", size = 1687713, upload-time = "2026-02-20T20:21:11.684Z" }, @@ -1603,7 +1595,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/98/6d/8f2ef704e614bcf58ed43cfb8d87afa1c285e98194ab2cfad351bf04f81e/greenlet-3.3.2-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:e26e72bec7ab387ac80caa7496e0f908ff954f31065b0ffc1f8ecb1338b11b54", size = 286617, upload-time = "2026-02-20T20:19:29.856Z" }, { url = "https://files.pythonhosted.org/packages/5e/0d/93894161d307c6ea237a43988f27eba0947b360b99ac5239ad3fe09f0b47/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b466dff7a4ffda6ca975979bab80bdadde979e29fc947ac3be4451428d8b0e4", size = 655189, upload-time = "2026-02-20T20:47:35.742Z" }, { url = "https://files.pythonhosted.org/packages/f5/2c/d2d506ebd8abcb57386ec4f7ba20f4030cbe56eae541bc6fd6ef399c0b41/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b8bddc5b73c9720bea487b3bffdb1840fe4e3656fba3bd40aa1489e9f37877ff", size = 658225, upload-time = "2026-02-20T20:56:02.527Z" }, - { url = "https://files.pythonhosted.org/packages/d1/67/8197b7e7e602150938049d8e7f30de1660cfb87e4c8ee349b42b67bdb2e1/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:59b3e2c40f6706b05a9cd299c836c6aa2378cabe25d021acd80f13abf81181cf", size = 666581, upload-time = "2026-02-20T21:02:51.526Z" }, { url = "https://files.pythonhosted.org/packages/8e/30/3a09155fbf728673a1dea713572d2d31159f824a37c22da82127056c44e4/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b26b0f4428b871a751968285a1ac9648944cea09807177ac639b030bddebcea4", size = 657907, upload-time = "2026-02-20T20:21:05.259Z" }, { url = "https://files.pythonhosted.org/packages/f3/fd/d05a4b7acd0154ed758797f0a43b4c0962a843bedfe980115e842c5b2d08/greenlet-3.3.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1fb39a11ee2e4d94be9a76671482be9398560955c9e568550de0224e41104727", size = 1618857, upload-time = "2026-02-20T20:49:37.309Z" }, { url = 
"https://files.pythonhosted.org/packages/6f/e1/50ee92a5db521de8f35075b5eff060dd43d39ebd46c2181a2042f7070385/greenlet-3.3.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:20154044d9085151bc309e7689d6f7ba10027f8f5a8c0676ad398b951913d89e", size = 1680010, upload-time = "2026-02-20T20:21:13.427Z" }, @@ -3528,6 +3519,7 @@ dependencies = [ { name = "opentelemetry-exporter-otlp-proto-http" }, { name = "opentelemetry-instrumentation-asyncio" }, { name = "opentelemetry-sdk" }, + { name = "openviking-sdk" }, { name = "pathspec" }, { name = "pdfminer-six" }, { name = "pdfplumber" }, @@ -3735,7 +3727,7 @@ requires-dist = [ { name = "build", marker = "extra == 'build'" }, { name = "cmake", marker = "extra == 'build'", specifier = ">=3.15" }, { name = "croniter", marker = "extra == 'bot'", specifier = ">=2.0.0" }, - { name = "cryptography", specifier = ">=42.0.0" }, + { name = "cryptography", specifier = ">=48.0.1" }, { name = "datasets", marker = "extra == 'benchmark'", specifier = ">=2.0.0" }, { name = "datasets", marker = "extra == 'eval'", specifier = ">=2.0.0" }, { name = "datasets", marker = "extra == 'test'", specifier = ">=2.0.0" }, @@ -3785,6 +3777,7 @@ requires-dist = [ { name = "opentelemetry-instrumentation-asyncio", specifier = ">=0.61b0" }, { name = "opentelemetry-sdk", specifier = ">=1.14" }, { name = "openviking", extras = ["bot", "bot-dingtalk", "bot-feishu", "bot-fuse", "bot-langfuse", "bot-opencode", "bot-qq", "bot-sandbox", "bot-slack", "bot-telegram"], marker = "extra == 'bot-full'" }, + { name = "openviking-sdk", specifier = ">=0.1.1" }, { name = "pandas", marker = "extra == 'benchmark'", specifier = ">=2.0.0" }, { name = "pandas", marker = "extra == 'eval'", specifier = ">=2.0.0" }, { name = "pandas", marker = "extra == 'test'", specifier = ">=2.0.0" }, @@ -3804,7 +3797,7 @@ requires-dist = [ { name = "pytest-cov", marker = "extra == 'test'", specifier = ">=4.0.0" }, { name = "pytest-xdist", marker = "extra == 'test'", specifier = ">=3.5.0" }, { 
name = "python-docx", specifier = ">=1.0.0" }, - { name = "python-multipart", specifier = ">=0.0.27" }, + { name = "python-multipart", specifier = ">=0.0.31" }, { name = "python-pptx", specifier = ">=1.0.0" }, { name = "python-socketio", marker = "extra == 'bot'", specifier = ">=5.11.0" }, { name = "python-socks", extras = ["asyncio"], marker = "extra == 'bot'", specifier = ">=2.4.0" }, @@ -3856,6 +3849,18 @@ provides-extras = ["test", "opengauss", "dev", "doc", "eval", "gemini", "gemini- [package.metadata.requires-dev] dev = [{ name = "pytest", specifier = ">=9.0.2" }] +[[package]] +name = "openviking-sdk" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b0/d2/1638f2d9592a87e2350a5d663db46b2bab8fff57b79f688a3ed63da069a1/openviking_sdk-0.1.2.tar.gz", hash = "sha256:673370c7df89fa7f7c6708e05251450e1cb736c5b94d1ba87ff18cd40d51eff8", size = 26700, upload-time = "2026-06-22T06:12:06.241Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/c7/f21f7899a8902bbc9b2f8bfc6f60eb0b64addf44fc71b76f0c72241183f6/openviking_sdk-0.1.2-py3-none-any.whl", hash = "sha256:9e4c719d0f3f84dd686ffce45b80e8730c815ce6e4da94b94416307c679caa5f", size = 16904, upload-time = "2026-06-22T06:12:04.866Z" }, +] + [[package]] name = "orjson" version = "3.11.7" @@ -5008,11 +5013,11 @@ wheels = [ [[package]] name = "python-multipart" -version = "0.0.27" +version = "0.0.32" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/69/9b/f23807317a113dc36e74e75eb265a02dd1a4d9082abc3c1064acd22997c4/python_multipart-0.0.27.tar.gz", hash = "sha256:9870a6a8c5a20a5bf4f07c017bd1489006ff8836cff097b6933355ee2b49b602", size = 44043, upload-time = "2026-04-27T10:51:26.649Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/5b/42/55c32bb9b12693c092ad250a0e82edb5b31ddeda6eb772de5f308b3804ad/python_multipart-0.0.32.tar.gz", hash = "sha256:be54b7f3fa167bb83e4fcd936b887b708f4e57fe75911c02aebf53efaf8d938e", size = 46881, upload-time = "2026-06-04T16:18:58.647Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/99/78/4126abcbdbd3c559d43e0db7f7b9173fc6befe45d39a2856cc0b8ec2a5a6/python_multipart-0.0.27-py3-none-any.whl", hash = "sha256:6fccfad17a27334bd0193681b369f476eda3409f17381a2d65aa7df3f7275645", size = 29254, upload-time = "2026-04-27T10:51:24.997Z" }, + { url = "https://files.pythonhosted.org/packages/e1/04/e8135ebd1ad02c56ec633277529b2602ff99ff634be76cdba5744cf554fd/python_multipart-0.0.32-py3-none-any.whl", hash = "sha256:ff6d3f776f16878c894e52e107296ffc890e913c611b1a4ec6c44e2821fe2e23", size = 30042, upload-time = "2026-06-04T16:18:57.319Z" }, ] [[package]] From 8f035826b63ea95a16d0d121c1ff5dc9da7adbc8 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Tue, 23 Jun 2026 20:07:20 +0800 Subject: [PATCH 29/31] fix: default values in docs --- docs/en/api/03-filesystem.md | 4 ++-- docs/en/api/06-retrieval.md | 8 ++++---- docs/zh/api/03-filesystem.md | 4 ++-- docs/zh/api/06-retrieval.md | 8 ++++---- examples/ov.conf.example | 2 +- .../skills/ov-resources/docs/filesystem.md | 2 +- .../skills/ov-resources/examples/commands.md | 2 +- openviking/storage/viking_fs.py | 8 ++++---- tests/storage/test_viking_fs_grep.py | 20 +++++++++++++++++++ 9 files changed, 39 insertions(+), 19 deletions(-) diff --git a/docs/en/api/03-filesystem.md b/docs/en/api/03-filesystem.md index 446989ded5..2778ab2c5a 100644 --- a/docs/en/api/03-filesystem.md +++ b/docs/en/api/03-filesystem.md @@ -817,7 +817,7 @@ Search content by pattern. 
| case_insensitive | bool | No | False | Ignore case | | exclude_uri | str | No | None | URI prefix to exclude from search | | node_limit | int | No | None | Maximum number of results | -| level_limit | int | No | 5 | Maximum directory depth to traverse | +| level_limit | int | No | Python SDK: 5; HTTP API / CLI / Go SDK: 10 | Maximum directory depth to traverse. The Go SDK currently uses the HTTP API default. | **Python SDK (Embedded / HTTP)** @@ -866,7 +866,7 @@ curl -X POST http://localhost:1933/api/v1/search/grep \ **CLI** ```bash -openviking grep viking://resources/ "authentication" [--ignore-case] +openviking grep "authentication" --uri viking://resources/ [--ignore-case] ``` **Response** diff --git a/docs/en/api/06-retrieval.md b/docs/en/api/06-retrieval.md index a38647dc57..b159f3a8a3 100644 --- a/docs/en/api/06-retrieval.md +++ b/docs/en/api/06-retrieval.md @@ -533,7 +533,7 @@ The `grep()` method performs regex pattern matching search in the file system, u | case_insensitive | bool | No | False | Ignore case | | exclude_uri | str | No | None | URI prefix to exclude from search | | node_limit | int | No | None | Maximum number of results | -| level_limit | int | No | 5 | Maximum directory depth to traverse | +| level_limit | int | No | Python SDK: 5; HTTP API / CLI / Go SDK: 10 | Maximum directory depth to traverse. The Go SDK currently uses the HTTP API default. | #### 3. 
Usage Examples @@ -590,13 +590,13 @@ fmt.Println(result["count"]) ```bash # Basic search -openviking grep viking://resources "authentication" +openviking grep "authentication" --uri viking://resources # Ignore case -openviking grep viking://resources "authentication" --ignore-case +openviking grep "authentication" --uri viking://resources --ignore-case # Specify depth limit -openviking grep viking://resources "TODO" --level-limit 3 +openviking grep "TODO" --uri viking://resources --level-limit 3 ``` **Response Example** diff --git a/docs/zh/api/03-filesystem.md b/docs/zh/api/03-filesystem.md index 4e5509e0c9..45288e6633 100644 --- a/docs/zh/api/03-filesystem.md +++ b/docs/zh/api/03-filesystem.md @@ -818,7 +818,7 @@ openviking mv viking://resources/old-name/ viking://resources/new-name/ | case_insensitive | bool | 否 | False | 忽略大小写 | | exclude_uri | str | 否 | None | 搜索时要排除的 URI 前缀 | | node_limit | int | 否 | None | 最大返回节点数 | -| level_limit | int | 否 | 5 | 最大目录遍历深度 | +| level_limit | int | 否 | Python SDK: 5;HTTP API / CLI / Go SDK: 10 | 最大目录遍历深度。Go SDK 当前使用 HTTP API 默认值。 | **Python SDK (Embedded / HTTP)** @@ -867,7 +867,7 @@ curl -X POST http://localhost:1933/api/v1/search/grep \ **CLI** ```bash -openviking grep viking://resources/ "authentication" [--ignore-case] +openviking grep "authentication" --uri viking://resources/ [--ignore-case] ``` **响应** diff --git a/docs/zh/api/06-retrieval.md b/docs/zh/api/06-retrieval.md index 40fb0400ba..1146473da8 100644 --- a/docs/zh/api/06-retrieval.md +++ b/docs/zh/api/06-retrieval.md @@ -535,7 +535,7 @@ openviking search "how to implement OAuth" -L 1,2 | case_insensitive | bool | 否 | False | 忽略大小写 | | node_limit | int | 否 | None | 最大返回节点数 | | exclude_uri | str | 否 | None | 要排除在搜索之外的 URI 前缀 | -| level_limit | int | 否 | 5 | 最大目录遍历深度 | +| level_limit | int | 否 | Python SDK: 5;HTTP API / CLI / Go SDK: 10 | 最大目录遍历深度。Go SDK 当前使用 HTTP API 默认值。 | #### 3. 
使用示例 @@ -592,13 +592,13 @@ fmt.Println(result["count"]) ```bash # 基础搜索 -openviking grep viking://resources "authentication" +openviking grep "authentication" --uri viking://resources # 忽略大小写 -openviking grep viking://resources "authentication" --ignore-case +openviking grep "authentication" --uri viking://resources --ignore-case # 指定深度限制 -openviking grep viking://resources "TODO" --level-limit 3 +openviking grep "TODO" --uri viking://resources --level-limit 3 ``` **响应示例** diff --git a/examples/ov.conf.example b/examples/ov.conf.example index 58f806f365..3ff148e3f3 100644 --- a/examples/ov.conf.example +++ b/examples/ov.conf.example @@ -150,7 +150,7 @@ "threshold": 0.1, }, "retrieval": {"hotness_alpha": 0.0, "score_propagation_alpha": 1.0}, - "grep": {"engine": "auto", "switch_to_remote_threshold": 1000}, + "grep": {"engine": "auto", "switch_to_remote_threshold": 10000}, "auto_generate_l0": true, "auto_generate_l1": true, "default_search_mode": "thinking", diff --git a/examples/skills/ov-resources/docs/filesystem.md b/examples/skills/ov-resources/docs/filesystem.md index 31373ea2cd..9007f128a8 100644 --- a/examples/skills/ov-resources/docs/filesystem.md +++ b/examples/skills/ov-resources/docs/filesystem.md @@ -117,7 +117,7 @@ ov mv viking://resources/old-name/ viking://resources/new-name/ ### `ov grep` — Search by regex pattern ```bash -ov grep viking://resources/ "authentication" --ignore-case +ov grep "authentication" --uri viking://resources/ --ignore-case ``` Parameters: `uri`, `pattern` (required), `--ignore-case`, `--exclude-uri`, `--node-limit`, `--level-limit`. 
diff --git a/examples/skills/ov-resources/examples/commands.md b/examples/skills/ov-resources/examples/commands.md index 2b27c76da3..4c3a715899 100644 --- a/examples/skills/ov-resources/examples/commands.md +++ b/examples/skills/ov-resources/examples/commands.md @@ -101,7 +101,7 @@ ov rm viking://resources/old-project/ --recursive ```bash # Grep content -ov grep viking://resources/ "TODO" --ignore-case +ov grep "TODO" --uri viking://resources/ --ignore-case # Glob files ov glob "**/*.md" --uri viking://resources/ diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 3f34bc9ae6..48594700b3 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -777,7 +777,7 @@ async def grep( exclude_uri: Optional URI prefix to exclude from search case_insensitive: Whether to perform case-insensitive matching node_limit: Maximum number of results to return - level_limit: Maximum depth level to traverse (default: 5) + level_limit: Maximum depth level to traverse (default: 10) ctx: Request context Internal bm25 recall limit is auto-adapted from node_limit as min(node_limit * 5, 100000); when node_limit is unset, use 100000. 
@@ -793,7 +793,7 @@ async def grep( # Read engine and threshold from grep_config (ov.conf) engine = self.grep_config.engine if self.grep_config else "auto" switch_to_remote_threshold = ( - self.grep_config.switch_to_remote_threshold if self.grep_config else 1000 + self.grep_config.switch_to_remote_threshold if self.grep_config else 10000 ) resolved_engine = await self._resolve_grep_engine( @@ -822,7 +822,7 @@ async def grep( ) async def _resolve_grep_engine( - self, engine: GrepEngine, uri: str, ctx, switch_to_remote_threshold: int = 1000 + self, engine: GrepEngine, uri: str, ctx, switch_to_remote_threshold: int = 10000 ) -> str: """Resolve the actual grep engine to use.""" if engine == "fs": @@ -1162,7 +1162,7 @@ async def _grep_encrypted( exclude_uri: Optional URI prefix to exclude from search case_insensitive: Whether to perform case-insensitive matching node_limit: Maximum number of results to return - level_limit: Maximum depth level to traverse (default: 5) + level_limit: Maximum depth level to traverse (default: 10) ctx: Request context Returns: diff --git a/tests/storage/test_viking_fs_grep.py b/tests/storage/test_viking_fs_grep.py index 11c7f10e6d..062d46297b 100644 --- a/tests/storage/test_viking_fs_grep.py +++ b/tests/storage/test_viking_fs_grep.py @@ -48,6 +48,26 @@ def test_grep_config_default_switch_to_remote_threshold_is_10000(): assert GrepConfig().switch_to_remote_threshold == 10000 +@pytest.mark.asyncio +async def test_grep_without_config_uses_documented_remote_threshold(monkeypatch): + fs = VikingFS(agfs=_DummyAgfs()) + vector_store = _DummyVectorStore() + vector_store._backend_type = "vikingdb" + monkeypatch.setattr(fs, "_get_vector_store", lambda: vector_store) + + async def fake_collection_has_fulltext(vector_store, ctx): + return True + + monkeypatch.setattr(fs, "_collection_has_fulltext", fake_collection_has_fulltext) + + async def fake_count(uri, ctx): + return 5000 + + monkeypatch.setattr(fs, "_get_cached_count", fake_count) + + assert 
await fs._resolve_grep_engine("auto", "viking://resources", None) == "fs" + + @pytest.mark.asyncio @pytest.mark.parametrize( ("node_limit", "expected_remote_limit"), From 8862316adff84842b4c1e8d463e278cadfc171a4 Mon Sep 17 00:00:00 2001 From: "liuyang.max" Date: Wed, 24 Jun 2026 10:59:44 +0800 Subject: [PATCH 30/31] optimize: increase truncate byte size to 1MB for content field for VikingDB --- docs/en/guides/01-configuration.md | 4 +++- docs/zh/guides/01-configuration.md | 4 +++- openviking/storage/viking_vector_index_backend.py | 2 +- tests/storage/test_collection_schemas.py | 6 ++++-- tests/storage/test_embedding_msg_converter_tenant.py | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index d92edbf967..7518e191fd 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -896,7 +896,9 @@ Grep engine configuration for content pattern search. These settings are server- | Parameter | Type | Description | Default | |-----------|------|-------------|---------| | `engine` | str | Search engine mode: `"auto"` uses VikingDB BM25 recall when available and falls back to local filesystem search; `"fs"` forces local filesystem search only. | `"auto"` | -| `switch_to_remote_threshold` | int | L2 record count threshold to switch to VikingDB BM25 recall. When the number of L2 files under the search scope exceeds this threshold, VikingDB BM25 is used for phase-1 recall; otherwise local filesystem search is used. Set to `0` to always use VikingDB BM25. Must be ≥ 0. | `10000` | +| `switch_to_remote_threshold` | int | L2 record count threshold to switch to VikingDB BM25 recall. When the number of L2 files under the search scope reaches this threshold, VikingDB BM25 is used for phase-1 recall; otherwise local filesystem search is used. Set to `0` to always use VikingDB BM25. Must be ≥ 0. 
| `10000` | + +For VikingDB / Volcengine FullText grep, OpenViking writes a `content` text field for BM25 recall. The source context keeps the full content, while the vector-store write payload truncates this field to **1 MB** at the final adapter boundary to stay within backend payload limits. ### storage diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md index 152f80f3b2..35fdf89909 100644 --- a/docs/zh/guides/01-configuration.md +++ b/docs/zh/guides/01-configuration.md @@ -867,7 +867,9 @@ Grep 引擎配置,用于内容模式搜索。这些设置为服务端配置, | 参数 | 类型 | 说明 | 默认值 | |------|------|------|--------| | `engine` | str | 搜索引擎模式:`"auto"` 在可用时使用 VikingDB BM25 召回,不可用时回退到本地文件系统搜索;`"fs"` 强制仅使用本地文件系统搜索。 | `"auto"` | -| `switch_to_remote_threshold` | int | 切换到 VikingDB BM25 召回的 L2 记录数阈值。当搜索范围内的 L2 文件数超过此阈值时,使用 VikingDB BM25 进行第一阶段召回;否则使用本地文件系统搜索。设为 `0` 表示始终使用 VikingDB BM25。必须 ≥ 0。 | `10000` | +| `switch_to_remote_threshold` | int | 切换到 VikingDB BM25 召回的 L2 记录数阈值。当搜索范围内的 L2 文件数达到此阈值时,使用 VikingDB BM25 进行第一阶段召回;否则使用本地文件系统搜索。设为 `0` 表示始终使用 VikingDB BM25。必须 ≥ 0。 | `10000` | + +对于 VikingDB / Volcengine FullText grep,OpenViking 会写入 `content` text 字段用于 BM25 召回。源上下文中保留完整内容,仅在最终写入向量库 adapter payload 时将该字段截断到 **1 MB**,以满足后端 payload 限制。 ### storage diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index 69d54b0ee4..9e411e650d 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -78,7 +78,7 @@ "account_id", ] -VIKINGDB_CONTENT_MAX_SIZE = 64 * 1024 +VIKINGDB_CONTENT_MAX_SIZE = 1024 * 1024 class _AsyncVectorAdapter: diff --git a/tests/storage/test_collection_schemas.py b/tests/storage/test_collection_schemas.py index 4efac92215..0da2711c3c 100644 --- a/tests/storage/test_collection_schemas.py +++ b/tests/storage/test_collection_schemas.py @@ -24,6 +24,7 @@ from openviking.storage.vectordb.collection.result import UpsertDataResult from 
openviking.storage.vectordb_adapters.local_adapter import LocalCollectionAdapter from openviking.storage.viking_vector_index_backend import ( + VIKINGDB_CONTENT_MAX_SIZE, VikingVectorIndexBackend, _SingleAccountBackend, ) @@ -799,7 +800,7 @@ def upsert(self, data): @pytest.mark.asyncio async def test_single_account_backend_truncates_content_only_at_vector_write(): captured = {} - full_content = "x" * (64 * 1024 + 17) + full_content = "x" * (1024 * 1024 + 17) class _Collection: def get_meta_data(self): @@ -840,7 +841,8 @@ def upsert(self, data): assert record_id == "rec-large" assert source_data["content"] == full_content - assert captured["data"]["content"] == full_content[: 64 * 1024] + assert VIKINGDB_CONTENT_MAX_SIZE == 1024 * 1024 + assert captured["data"]["content"] == full_content[:VIKINGDB_CONTENT_MAX_SIZE] @pytest.mark.asyncio diff --git a/tests/storage/test_embedding_msg_converter_tenant.py b/tests/storage/test_embedding_msg_converter_tenant.py index 61759548bb..357a40eedd 100644 --- a/tests/storage/test_embedding_msg_converter_tenant.py +++ b/tests/storage/test_embedding_msg_converter_tenant.py @@ -48,7 +48,7 @@ def test_embedding_msg_converter_backfills_account_and_owner_fields( def test_embedding_msg_converter_preserves_full_content_without_vikingdb_truncation(): - full_content = "x" * (64 * 1024 + 17) + full_content = "x" * (1024 * 1024 + 17) context = Context(uri="viking://resources/large.txt", abstract="short embedding text") context.set_vectorize(Vectorize(text="short embedding text", full_text=full_content)) From 111c126819a8d53ef973a12997f13e91a640ba83 Mon Sep 17 00:00:00 2001 From: Qin Haojie Date: Tue, 23 Jun 2026 16:00:22 +0800 Subject: [PATCH 31/31] fix(logger): harden queued stream logging (#2786) * fix(logger): replace StreamHandler with QueueHandler+QueueListener to prevent thread deadlock When log.output='stdout' (default) and the server is managed by systemd, concurrent log writes can deadlock because logging.StreamHandler holds a thread 
lock across stream.flush() which blocks on systemd-piped file I/O. During session.commit() phase 2, multiple async coroutines (memory extraction, summarization) concurrently call logger.info()/warning() with large payloads. The first thread's flush() blocks on the pipe, while all subsequent threads block on handler.acquire() forever. This permanently silences the server log and prevents _write_done_file() from executing, leaving phase 2 hanging without .done. Fix: use QueueHandler + QueueListener from stdlib logging.handlers (Python 3.2+). QueueHandler.emit() does queue.put(record) with no lock or I/O, returning immediately. QueueListener has a dedicated single thread as the sole consumer touching the real StreamHandler, making lock contention impossible. Changes in _create_log_handler(): stdout/stderr branches now create a shared QueueListener with unbounded queue, returning QueueHandler instances to callers. _build_standard_handler() delegates formatter and filter setup to the real handler in the listener thread. 
Closes: #2752 * fix(logger): harden queued stream logging --------- Co-authored-by: njuboy11 --- tests/server/test_api_code.py | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/tests/server/test_api_code.py b/tests/server/test_api_code.py index 614b72391f..25aabefc01 100644 --- a/tests/server/test_api_code.py +++ b/tests/server/test_api_code.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: AGPL-3.0 """Tests for /api/v1/code/* endpoints.""" -import pytest from openviking_cli.exceptions import PermissionDeniedError @@ -54,9 +53,7 @@ async def fake_read(uri, ctx=None, **_): monkeypatch.setattr(service.fs, "read", fake_read) - resp = await client.post( - "/api/v1/code/outline", json={"uri": "viking://resources/x.py"} - ) + resp = await client.post("/api/v1/code/outline", json={"uri": "viking://resources/x.py"}) assert resp.status_code == 403 body = resp.json() assert body["status"] == "error" @@ -80,9 +77,7 @@ async def fake_read(uri, ctx=None, **_): monkeypatch.setattr(service.fs, "read", fake_read) - resp = await client.post( - "/api/v1/code/outline", json={"uri": "viking://resources/x.py"} - ) + resp = await client.post("/api/v1/code/outline", json={"uri": "viking://resources/x.py"}) assert resp.status_code == 200 assert "is not text" in resp.json()["result"] @@ -116,18 +111,14 @@ async def fake_read(uri, ctx=None, **_): assert "viking://r/a.py" in body["result"] async def test_invalid_uri(self, client): - resp = await client.post( - "/api/v1/code/search", json={"uri": "/tmp/dir", "query": "foo"} - ) + resp = await client.post("/api/v1/code/search", json={"uri": "/tmp/dir", "query": "foo"}) assert resp.status_code == 200 body = resp.json() assert body["result"].startswith("Error:") assert "viking://" in body["result"] async def test_empty_query(self, client): - resp = await client.post( - "/api/v1/code/search", json={"uri": "viking://r", "query": ""} - ) + resp = await client.post("/api/v1/code/search", json={"uri": 
"viking://r", "query": ""}) assert resp.status_code == 200 assert resp.json()["result"] == "Error: empty query" @@ -137,9 +128,7 @@ async def fake_ls(uri, ctx=None, recursive=False, output=None, **_): monkeypatch.setattr(service.fs, "ls", fake_ls) - resp = await client.post( - "/api/v1/code/search", json={"uri": "viking://r", "query": "foo"} - ) + resp = await client.post("/api/v1/code/search", json={"uri": "viking://r", "query": "foo"}) assert resp.status_code == 200 assert "No supported source files" in resp.json()["result"] @@ -149,9 +138,7 @@ async def fake_ls(uri, ctx=None, recursive=False, output=None, **_): monkeypatch.setattr(service.fs, "ls", fake_ls) - resp = await client.post( - "/api/v1/code/search", json={"uri": "viking://r", "query": "foo"} - ) + resp = await client.post("/api/v1/code/search", json={"uri": "viking://r", "query": "foo"}) assert resp.status_code == 403 body = resp.json() assert body["status"] == "error" @@ -252,9 +239,7 @@ async def fake_read(uri, ctx=None, **_): assert "def greet" in resp.json()["result"] async def test_invalid_uri(self, client): - resp = await client.post( - "/api/v1/code/expand", json={"uri": "/tmp/x.py", "symbol": "foo"} - ) + resp = await client.post("/api/v1/code/expand", json={"uri": "/tmp/x.py", "symbol": "foo"}) assert resp.status_code == 200 body = resp.json() assert body["result"].startswith("Error:")