diff --git a/src/server/query_processor.py b/src/server/query_processor.py
index f2f2ae90..5c37324a 100644
--- a/src/server/query_processor.py
+++ b/src/server/query_processor.py
@@ -93,6 +93,7 @@ async def _check_s3_cache(
         subpath=query.subpath,
         include_patterns=query.include_patterns,
         ignore_patterns=query.ignore_patterns,
+        max_file_size=query.max_file_size,
     )
 
     # Check if file exists on S3
@@ -172,6 +173,7 @@ def _store_digest_content(
         subpath=query.subpath,
         include_patterns=query.include_patterns,
         ignore_patterns=query.ignore_patterns,
+        max_file_size=query.max_file_size,
     )
     s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
 
diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py
index f02f0270..7bec2553 100644
--- a/src/server/s3_utils.py
+++ b/src/server/s3_utils.py
@@ -65,6 +65,7 @@ def generate_s3_file_path(
     subpath: str,
     include_patterns: set[str] | None,
     ignore_patterns: set[str],
+    max_file_size: int,
 ) -> str:
     """Generate S3 file path with proper naming convention.
 
@@ -92,6 +93,8 @@ def generate_s3_file_path(
         Set of patterns specifying which files to include.
     ignore_patterns : set[str]
         Set of patterns specifying which files to exclude.
+    max_file_size : int
+        Maximum file size in bytes to include in the ingestion.
 
     Returns
     -------
@@ -110,9 +113,10 @@ def generate_s3_file_path(
         logger.error(msg)
         raise ValueError(msg)
 
-    # Create hash of exclude/include patterns for uniqueness
+    # Create hash of exclude/include patterns and size for uniqueness
     patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}"
     patterns_str += f"exclude:{sorted(ignore_patterns)}"
+    patterns_str += f"size:{max_file_size}"
    patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16]
 
     subpath_hash = hashlib.sha256(subpath.encode()).hexdigest()[:16]
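
For reference, a minimal standalone sketch of the effect of this change: folding `max_file_size` into the hashed string means two queries that differ only in their size limit now map to distinct S3 cache paths instead of sharing a stale digest. The `_patterns_hash` wrapper below is hypothetical (in the repo this logic lives inline in `generate_s3_file_path`); the hashing lines themselves mirror the patched code.

```python
import hashlib

def _patterns_hash(
    include_patterns: set[str] | None,
    ignore_patterns: set[str],
    max_file_size: int,
) -> str:
    # Same scheme as the patched generate_s3_file_path: the size limit is
    # part of the hashed string, so different limits yield different keys.
    patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}"
    patterns_str += f"exclude:{sorted(ignore_patterns)}"
    patterns_str += f"size:{max_file_size}"
    return hashlib.sha256(patterns_str.encode()).hexdigest()[:16]

# Identical patterns, different size limits -> distinct cache keys.
print(_patterns_hash(None, {"*.log"}, 50_000))
print(_patterns_hash(None, {"*.log"}, 10_000_000))
```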