-
Notifications
You must be signed in to change notification settings - Fork 9.2k
Fix: filter api in dataset document #14728
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -720,23 +720,25 @@ def list_docs(dataset_id, tenant_id): | |
| logging.error(f"You don't own the dataset {dataset_id}. ") | ||
| return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ") | ||
|
|
||
| err_code, err_msg, docs, total = _get_docs_with_request(request, dataset_id) | ||
| if request.args.get("type") == "filter": | ||
| err_code, err_msg, payload, total = _get_doc_filters_with_request(request, dataset_id) | ||
| if err_code != RetCode.SUCCESS: | ||
| return get_data_error_result(code=err_code, message=err_msg) | ||
| return get_json_result(data={"total": total, "filter": payload}) | ||
|
|
||
| err_code, err_msg, payload, total = _get_docs_with_request(request, dataset_id) | ||
| if err_code != RetCode.SUCCESS: | ||
| return get_data_error_result(code=err_code, message=err_msg) | ||
|
|
||
| if request.args.get("type") == "filter": | ||
| docs_filter = _aggregate_filters(docs) | ||
| return get_json_result(data={"total": total, "filter": docs_filter}) | ||
| else: | ||
| renamed_doc_list = [map_doc_keys(doc) for doc in docs] | ||
| for doc_item in renamed_doc_list: | ||
| if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX): | ||
| doc_item["thumbnail"] = f"/api/v1/documents/images/{dataset_id}-{doc_item['thumbnail']}" | ||
| if doc_item.get("source_type"): | ||
| doc_item["source_type"] = doc_item["source_type"].split("/")[0] | ||
| if doc_item["parser_config"].get("metadata"): | ||
| doc_item["parser_config"]["metadata"] = turn2jsonschema(doc_item["parser_config"]["metadata"]) | ||
| return get_json_result(data={"total": total, "docs": renamed_doc_list}) | ||
| renamed_doc_list = [map_doc_keys(doc) for doc in payload] | ||
| for doc_item in renamed_doc_list: | ||
| if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX): | ||
| doc_item["thumbnail"] = f"/api/v1/documents/images/{dataset_id}-{doc_item['thumbnail']}" | ||
| if doc_item.get("source_type"): | ||
| doc_item["source_type"] = doc_item["source_type"].split("/")[0] | ||
| if doc_item["parser_config"].get("metadata"): | ||
| doc_item["parser_config"]["metadata"] = turn2jsonschema(doc_item["parser_config"]["metadata"]) | ||
| return get_json_result(data={"total": total, "docs": renamed_doc_list}) | ||
|
|
||
|
|
||
| def _get_docs_with_request(req, dataset_id:str): | ||
|
|
@@ -832,6 +834,40 @@ def _get_docs_with_request(req, dataset_id:str): | |
|
|
||
| return RetCode.SUCCESS, "", docs, total | ||
|
|
||
|
|
||
| def _get_doc_filters_with_request(req, dataset_id: str): | ||
| """Get aggregated document filters with request parameters from a dataset.""" | ||
| q = req.args | ||
|
|
||
| keywords = q.get("keywords", "") | ||
|
|
||
| suffix = q.getlist("suffix") | ||
|
|
||
| types = q.getlist("types") | ||
| if types: | ||
| invalid_types = {t for t in types if t not in VALID_FILE_TYPES} | ||
| if invalid_types: | ||
| msg = f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}" | ||
| return RetCode.DATA_ERROR, msg, {}, 0 | ||
|
|
||
| run_status = q.getlist("run") | ||
| run_status_text_to_numeric = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"} | ||
| run_status_converted = [run_status_text_to_numeric.get(v, v) for v in run_status] | ||
| if run_status_converted: | ||
| invalid_status = {s for s in run_status_converted if s not in run_status_text_to_numeric.values()} | ||
| if invalid_status: | ||
| msg = f"Invalid filter run status conditions: {', '.join(invalid_status)}" | ||
| return RetCode.DATA_ERROR, msg, {}, 0 | ||
|
|
||
| docs_filter, total = DocumentService.get_filter_by_kb_id( | ||
| dataset_id, | ||
| keywords, | ||
| run_status_converted, | ||
| types, | ||
| suffix, | ||
| ) | ||
| return RetCode.SUCCESS, "", docs_filter, total | ||
|
Comment on lines
+838
to
+869
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "1) Inspect filter service API surface"
rg -n --type=py -C3 '\bdef\s+get_filter_by_kb_id\s*\('
echo
echo "2) Compare list-docs parsing logic vs filter parsing logic"
rg -n --type=py -C2 '_get_docs_with_request|_get_doc_filters_with_request|create_time_from|metadata_condition|return_empty_metadata|doc_ids|name=doc_name'
echo
echo "3) Check tests/usages that combine type=filter with metadata/time/id constraints"
rg -n -C2 'type=filter|metadata_condition|create_time_from|create_time_to|\\bids\\b|\\bid\\b'
Repository: infiniflow/ragflow — Length of output: 50375
🏁 Script executed:
# Find where _get_doc_filters_with_request is called from (likely in routes)
rg -n --type=py '_get_doc_filters_with_request' -B 5 -A 10
# Find the endpoint/route definition that calls this function
rg -n --type=py 'def.*filter.*\(|@.*\.route.*filter' -A 5 api/apps/restful_apis/
# Check what parameters the API endpoint actually accepts from request
rg -n --type=py 'q\.get|q\.getlist' api/apps/restful_apis/document_api.py | head -40Repository: infiniflow/ragflow Length of output: 5846 🏁 Script executed: # Check if metadata_condition is parsed in _get_doc_filters_with_request
sed -n '838,870p' api/apps/restful_apis/document_api.py
# Check what _parse_doc_id_filter_with_metadata does and if it's used
rg -n '_parse_doc_id_filter_with_metadata' api/apps/restful_apis/document_api.py -A 30 | head -50
# Check tests that combine type=filter with metadata or time params
rg -n 'type=filter.*create_time|type=filter.*metadata|type=filter.*\bid\b|type=filter.*\bids\b' test/ sdk/ -r 2>/dev/null | head -20Repository: infiniflow/ragflow Length of output: 4298 🏁 Script executed: # Check the full implementation of _parse_doc_id_filter_with_metadata to understand metadata handling
sed -n '871,950p' api/apps/restful_apis/document_api.py
# Check DocumentService.get_filter_by_kb_id full implementation to see if it supports additional params
sed -n '185,250p' api/db/services/document_service.py
# Check if there are any tests that use type=filter with metadata/time/id params
rg -n 'type.*filter' test/testcases/ sdk/python/test/ -A 5 -B 2 | grep -E '(type|filter|metadata|create_time|\.get|\.post)' | head -40Repository: infiniflow/ragflow Length of output: 7736 🏁 Script executed: # Verify if the filter endpoint query parameters are actually accepted/documented
# by checking the route definition for the documents endpoint
rg -n '@.*\.route.*documents' api/apps/restful_apis/document_api.py -B 5 -A 20 | head -80
# Check if there are any tests that combine type=filter with metadata or time constraints
rg -n 'document_filter.*metadata|document_filter.*create_time|document_filter.*\bid\b|document_filter.*\bids\b' test/ -r -A 5 -B 5Repository: infiniflow/ragflow Length of output: 3198 🏁 Script executed: # Check the list_documents endpoint signature and documentation
sed -n '703,750p' api/apps/restful_apis/document_api.py
# Search more carefully for tests using document_filter
rg -n 'document_filter' test/ -A 3 -B 3 | grep -E '(metadata|create_time|id|ids|name)' | head -30
# Check the route documentation/swagger for the documents endpoint
rg -n 'parameters:' api/apps/restful_apis/document_api.py -A 100 | grep -A 100 'def list_documents' | head -80Repository: infiniflow/ragflow Length of output: 5877 🏁 Script executed: # Look for any test that calls document_filter with parameters besides the basic ones
rg -n 'document_filter.*\{' test/ -A 2 -B 1 | head -50
# Check the route documentation to see what parameters are documented as supported
sed -n '650,750p' api/apps/restful_apis/document_api.pyRepository: infiniflow/ragflow Length of output: 5530 🏁 Script executed: # Check if _get_doc_filters_with_request is newly added or if it's part of old code
git log -p --all -S '_get_doc_filters_with_request' -- api/apps/restful_apis/document_api.py 2>/dev/null | head -50 || echo "Git history not available"
# Check logging in the file
rg -n 'logging\.' api/apps/restful_apis/document_api.py | wc -l
# Check if there's logging in _get_doc_filters_with_request
sed -n '838,870p' api/apps/restful_apis/document_api.py | grep -i logRepository: infiniflow/ragflow Length of output: 65 🏁 Script executed: # Check if there's logging specifically in _get_doc_filters_with_request and _get_docs_with_request
echo "=== Logging in _get_docs_with_request ==="
sed -n '774,835p' api/apps/restful_apis/document_api.py | grep -i log
echo
echo "=== Logging in _get_doc_filters_with_request ==="
sed -n '838,870p' api/apps/restful_apis/document_api.py | grep -i log
# Check if there's ANY logging about which path is taken
sed -n '720,730p' api/apps/restful_apis/document_api.pyRepository: infiniflow/ragflow Length of output: 759 🏁 Script executed: # Verify the test actually passes ids to the filter endpoint and check if it expects them to be processed
sed -n '213,220p' test/testcases/test_web_api/test_document_app/test_document_metadata.py
# Check if there are any comments in the code explaining why filter endpoint doesn't support id/ids filtering
sed -n '720,870p' api/apps/restful_apis/document_api.py | grep -i 'filter\|ignore\|note\|todo\|fixme' -A 2 -B 2
# Verify the guidelines requirement about logging
grep -r 'Add logging' .Repository: infiniflow/ragflow Length of output: 4908
Additionally, this new flow lacks logging as required by the coding guidelines. 🤖 Prompt for AI Agents |
||
|
|
||
| def _parse_doc_id_filter_with_metadata(req, kb_id): | ||
| """Parse document ID filter based on metadata conditions from the request. | ||
|
|
||
|
|
@@ -1053,65 +1089,6 @@ async def delete_documents(tenant_id, dataset_id): | |
| logging.exception(e) | ||
| return get_error_data_result(message="Internal server error") | ||
|
|
||
|
|
||
| def _aggregate_filters(docs): | ||
| """Aggregate filter options from a list of documents. | ||
|
|
||
| This function processes a list of document dictionaries and aggregates | ||
| available filter values for building filter UI on the client side. | ||
|
|
||
| Args: | ||
| docs (list): List of document dictionaries, each containing: | ||
| - id (str): Document ID | ||
| - suffix (str): File extension (e.g., "pdf", "docx") | ||
| - run (int): Parsing status code (0=UNSTART, 1=RUNNING, 2=CANCEL, 3=DONE, 4=FAIL) | ||
|
|
||
| Returns: | ||
| tuple: A tuple containing: | ||
| - dict: Aggregated filter options with keys: | ||
| - suffix: Dict mapping file extensions to document counts | ||
| - run_status: Dict mapping status codes to document counts | ||
| - metadata: Dict mapping metadata field names to value counts | ||
| - int: Total number of documents processed | ||
| """ | ||
| suffix_counter = {} | ||
| run_status_counter = {} | ||
| metadata_counter = {} | ||
| empty_metadata_count = 0 | ||
|
|
||
| for doc in docs: | ||
| suffix_counter[doc.get("suffix")] = suffix_counter.get(doc.get("suffix"), 0) + 1 | ||
| key_of_run = str(doc.get("run")) | ||
| run_status_counter[key_of_run] = run_status_counter.get(key_of_run, 0) + 1 | ||
| meta_fields = doc.get("meta_fields", {}) | ||
|
|
||
| if not meta_fields: | ||
| empty_metadata_count += 1 | ||
| continue | ||
| has_valid_meta = False | ||
|
|
||
| for key, value in meta_fields.items(): | ||
| values = value if isinstance(value, list) else [value] | ||
| for vv in values: | ||
| if vv is None: | ||
| continue | ||
| if isinstance(vv, str) and not vv.strip(): | ||
| continue | ||
| sv = str(vv) | ||
| if key not in metadata_counter: | ||
| metadata_counter[key] = {} | ||
| metadata_counter[key][sv] = metadata_counter[key].get(sv, 0) + 1 | ||
| has_valid_meta = True | ||
| if not has_valid_meta: | ||
| empty_metadata_count += 1 | ||
|
|
||
| metadata_counter["empty_metadata"] = {"true": empty_metadata_count} | ||
| return { | ||
| "suffix": suffix_counter, | ||
| "run_status": run_status_counter, | ||
| "metadata": metadata_counter, | ||
| } | ||
|
|
||
| @manager.route("/datasets/<dataset_id>/documents/<document_id>/metadata/config", methods=["PUT"]) # noqa: F821 | ||
| @login_required | ||
| @add_tenant_id_to_kwargs | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add logs for the new `type=filter` flow and validation exits.
Lines 723-727 and lines 847-860 introduce a new request flow and early-return branches without any logging. Please add at least a debug log on entry, plus warning/error logs on validation failures, for traceability.
Suggested patch
As per the coding guidelines for `**/*.py`: add logging for new flows. This also applies to lines 847-860.
🤖 Prompt for AI Agents