Skip to content

Commit 7ec0217

Browse files
committed
perf: Speed up purge_table by deduplicating manifest reads and parallelizing file deletion
Three changes reduce purge_table wall time from ~7s to ~0.13s (54x) on a table with 200 snapshots:

1. Deduplicate manifests by path before iterating in delete_data_files(). The same manifest appears across many snapshots' manifest lists; for 200 snapshots this reduces 20,100 manifest opens to 200.

2. Parallelize file deletion using the existing ExecutorFactory ThreadPoolExecutor, matching the pattern already used for manifest reading in plan_files() and data file reading in to_arrow(). This aligns with the Java reference implementation (CatalogUtil.dropTableData), which also deletes files concurrently via a worker thread pool.

3. Cache Avro-to-Iceberg schema conversion and reader tree resolution. All manifests of the same type share the same Avro schema, but it was being JSON-parsed, converted, and resolved into a reader tree on every open. Uses an explicit threading.Lock for thread safety across all Python implementations.
1 parent 1a54e9c commit 7ec0217

2 files changed

Lines changed: 61 additions & 15 deletions

File tree

pyiceberg/avro/file.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@
2222
import io
2323
import json
2424
import os
25+
import threading
2526
from collections.abc import Callable
2627
from dataclasses import dataclass
2728
from enum import Enum
2829
from types import TracebackType
2930
from typing import (
31+
Any,
3032
Generic,
3133
TypeVar,
3234
)
@@ -68,6 +70,41 @@
6870
_SCHEMA_KEY = "avro.schema"
6971

7072

# Cache Avro-to-Iceberg schema conversion and resolved reader trees.
# Manifests of the same type share the same Avro schema, so these caches
# avoid redundant JSON parsing, schema conversion, and reader tree construction.
# Reader objects are stateless — read() takes a decoder and returns decoded
# data without mutating self, so sharing across threads/calls is safe.
# Uses explicit locking instead of lru_cache for thread safety across all
# Python implementations (not just CPython).
# NOTE(review): both caches grow without bound; acceptable while the key space
# is the small fixed set of manifest schemas — revisit if that changes.
_schema_cache: dict[str, Schema] = {}
_reader_cache: dict[tuple[Any, ...], Reader] = {}
_cache_lock = threading.Lock()


def _cached_avro_to_iceberg(avro_schema_string: str) -> Schema:
    """Convert an Avro schema JSON string to an Iceberg Schema, with caching.

    Args:
        avro_schema_string: The raw ``avro.schema`` file-header value (JSON text).

    Returns:
        The cached (or freshly converted) Iceberg Schema.
    """
    # Do both the lookup and the insert while holding the lock. The previous
    # double-checked pattern read the dict without the lock, which is only safe
    # under CPython's GIL — not on the "all Python implementations" this cache
    # is documented to support. Conversion is cheap and rare, so the extra lock
    # acquisition on cache hits is negligible.
    with _cache_lock:
        schema = _schema_cache.get(avro_schema_string)
        if schema is None:
            avro_schema = json.loads(avro_schema_string)
            schema = AvroSchemaConversion().avro_to_iceberg(avro_schema)
            _schema_cache[avro_schema_string] = schema
        return schema
def _cached_resolve_reader(
    file_schema: Schema,
    read_schema: Schema,
    read_types: dict[int, Callable[..., StructProtocol]],
    read_enums: dict[int, Callable[..., Enum]],
) -> Reader:
    """Resolve a (file schema -> read schema) pair into a reader tree, with caching.

    Args:
        file_schema: The schema the Avro file was written with.
        read_schema: The schema the caller wants to read into.
        read_types: Per-field-id struct constructors to instantiate while reading.
        read_enums: Per-field-id enum constructors to instantiate while reading.

    Returns:
        The cached (or freshly resolved) reader tree.
    """
    # Schemas are keyed by their string form; callables are keyed by identity
    # via the sorted (id, callable) pairs, so the key is hashable and stable.
    key = (str(file_schema), str(read_schema), tuple(sorted(read_types.items())), tuple(sorted(read_enums.items())))
    # Lookup and insert both happen under the lock: the previous double-checked
    # pattern read the dict unlocked, which is safe only under CPython's GIL,
    # defeating the stated goal of portability to other implementations.
    with _cache_lock:
        reader = _reader_cache.get(key)
        if reader is None:
            reader = resolve_reader(file_schema, read_schema, read_types, read_enums)
            _reader_cache[key] = reader
        return reader
71108
class AvroFileHeader(Record):
72109
@property
73110
def magic(self) -> bytes:
@@ -97,9 +134,7 @@ def compression_codec(self) -> type[Codec] | None:
97134

98135
def get_schema(self) -> Schema:
99136
if _SCHEMA_KEY in self.meta:
100-
avro_schema_string = self.meta[_SCHEMA_KEY]
101-
avro_schema = json.loads(avro_schema_string)
102-
return AvroSchemaConversion().avro_to_iceberg(avro_schema)
137+
return _cached_avro_to_iceberg(self.meta[_SCHEMA_KEY])
103138
else:
104139
raise ValueError("No schema found in Avro file headers")
105140

@@ -178,7 +213,7 @@ def __enter__(self) -> AvroFile[D]:
178213
if not self.read_schema:
179214
self.read_schema = self.schema
180215

181-
self.reader = resolve_reader(self.schema, self.read_schema, self.read_types, self.read_enums)
216+
self.reader = _cached_resolve_reader(self.schema, self.read_schema, self.read_types, self.read_enums)
182217

183218
return self
184219

pyiceberg/catalog/__init__.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@
9090
MANIFEST_LIST = "manifest list"
9191
PREVIOUS_METADATA = "previous metadata"
9292
METADATA = "metadata"
93+
DATA_FILE = "data"
9394
URI = "uri"
9495
LOCATION = "location"
9596
EXTERNAL_TABLE = "EXTERNAL_TABLE"
@@ -284,7 +285,7 @@ def list_catalogs() -> list[str]:
284285

285286

286287
def delete_files(io: FileIO, files_to_delete: set[str], file_type: str) -> None:
    """Delete files in parallel.

    Failures are logged as warnings rather than raised, so one undeletable
    file does not abort the remaining deletions.

    Args:
        io: The FileIO used to delete the object.
        files_to_delete: A set of file paths to be deleted.
        file_type: The type of the file, used in the warning message.
    """
    from pyiceberg.utils.concurrent import ExecutorFactory

    def _try_delete(file: str) -> None:
        try:
            io.delete(file)
        except OSError:
            logger.warning(f"Failed to delete {file_type} file {file}", exc_info=logger.isEnabledFor(logging.DEBUG))

    # Executor.map is lazy: drain the iterator so every deletion has completed
    # before this function returns.
    for _ in ExecutorFactory.get_or_create().map(_try_delete, files_to_delete):
        pass
302308

303309
def delete_data_files(io: FileIO, manifests_to_delete: list[ManifestFile]) -> None:
    """Delete data files linked to given manifests.

    Deduplicates manifests by path (the same manifest appears across many
    snapshots' manifest lists) and deletes the referenced data files in
    parallel. Failures to delete individual files are logged, not raised.

    Args:
        io: The FileIO used to delete the object.
        manifests_to_delete: A list of manifests containing paths of data files to be deleted.
    """
    # With N snapshots, each manifest can appear in up to N manifest lists;
    # keeping only the first occurrence per path means each manifest is opened
    # once (for 200 snapshots: 200 opens instead of 20,100).
    unique_manifests: dict[str, ManifestFile] = {}
    for manifest in manifests_to_delete:
        if manifest.manifest_path not in unique_manifests:
            unique_manifests[manifest.manifest_path] = manifest

    # Gather every distinct data file path referenced by the surviving manifests.
    data_file_paths: set[str] = {
        entry.data_file.file_path
        for manifest in unique_manifests.values()
        for entry in manifest.fetch_manifest_entry(io, discard_deleted=False)
    }

    # Fan the deletions out across the shared worker pool.
    delete_files(io, data_file_paths, DATA_FILE)
322333

323334

324335
def _import_catalog(name: str, catalog_impl: str, properties: Properties) -> Catalog | None:

0 commit comments

Comments
 (0)