From f2be9cc3580c4cf1f987bca7ae0f41d6ed9cdaf5 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 02:01:12 +0800 Subject: [PATCH 01/21] python: add explicit cuvs accelerator path --- python/python/lance/cuvs.py | 238 +++++++++++++++++++++++ python/python/lance/dataset.py | 142 +++++++++----- python/python/lance/indices/builder.py | 25 +++ python/python/tests/test_vector_index.py | 129 ++++++++++++ 4 files changed, 480 insertions(+), 54 deletions(-) create mode 100644 python/python/lance/cuvs.py diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py new file mode 100644 index 00000000000..ab46eb8a432 --- /dev/null +++ b/python/python/lance/cuvs.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +from __future__ import annotations + +from importlib import import_module +from typing import Tuple + +import pyarrow as pa + +from .dependencies import numpy as np + + +def is_cuvs_accelerator(accelerator: object) -> bool: + return accelerator == "cuvs" + + +def _require_cuvs(): + try: + return import_module("cuvs.neighbors.ivf_pq") + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "accelerator='cuvs' requires the 'cuvs' package to be installed" + ) from exc + + +def _optional_cupy(): + try: + return import_module("cupy") + except ModuleNotFoundError: + return None + + +def _metric_to_cuvs(metric_type: str) -> str: + metric_type = metric_type.lower() + if metric_type in {"l2", "euclidean"}: + return "sqeuclidean" + if metric_type == "dot": + return "inner_product" + if metric_type == "cosine": + return "cosine" + raise ValueError(f"Metric '{metric_type}' is not supported by cuVS IVF_PQ") + + +def _column_to_numpy(table: pa.Table, column: str) -> np.ndarray: + array = table.column(column).combine_chunks() + values = array.to_pylist() + if len(values) == 0: + raise ValueError("cuVS training requires at least one training vector") + matrix = np.asarray(values) + if 
matrix.ndim != 2: + raise ValueError( + f"Expected a 2D training matrix for column '{column}', got {matrix.shape}" + ) + if matrix.dtype == np.float64: + matrix = matrix.astype(np.float32) + elif matrix.dtype not in (np.float16, np.float32): + matrix = matrix.astype(np.float32) + return matrix + + +def _as_numpy(array_like) -> np.ndarray: + if isinstance(array_like, np.ndarray): + return array_like + try: + array = np.asarray(array_like) + if isinstance(array, np.ndarray): + return array + except Exception: + pass + + if hasattr(array_like, "get"): + return np.asarray(array_like.get()) + + cupy = _optional_cupy() + if cupy is not None: + return cupy.asnumpy(array_like) + + raise TypeError("Unable to convert cuVS output to numpy") + + +def _normalize_centroids(index, num_partitions: int, dimension: int) -> np.ndarray: + centroids = _as_numpy(index.centers) + if centroids.shape != (num_partitions, dimension): + raise ValueError( + "cuVS returned incompatible IVF centroids shape: " + f"expected {(num_partitions, dimension)}, got {centroids.shape}" + ) + return centroids + + +def _normalize_pq_codebook( + index, num_sub_vectors: int, num_bits: int, dimension: int +) -> np.ndarray: + pq_book_size = 1 << num_bits + subvector_dim = dimension // num_sub_vectors + pq_centers = _as_numpy(index.pq_centers) + + expected_shapes = { + (num_sub_vectors, subvector_dim, pq_book_size): (0, 2, 1), + (num_sub_vectors, pq_book_size, subvector_dim): None, + } + transpose = expected_shapes.get(pq_centers.shape) + if transpose is None and pq_centers.shape not in expected_shapes: + raise ValueError( + "cuVS returned incompatible PQ codebook shape: expected one of " + f"{list(expected_shapes.keys())}, got {pq_centers.shape}" + ) + if transpose is not None: + pq_centers = np.transpose(pq_centers, transpose) + return pq_centers + + +def _estimate_trainset_fraction( + num_rows: int, num_partitions: int, sample_rate: int +) -> float: + if num_rows <= 0: + raise ValueError("cuVS training 
requires a non-empty dataset") + desired_rows = max(num_partitions * sample_rate, 256 * 256) + return min(1.0, desired_rows / num_rows) + + +def train_ivf_pq_on_cuvs( + dataset, + column: str, + num_partitions: int, + metric_type: str, + accelerator: str, + num_sub_vectors: int, + *, + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + filter_nan: bool = True, +) -> Tuple[np.ndarray, np.ndarray]: + if accelerator != "cuvs": + raise ValueError("cuVS acceleration only supports accelerator='cuvs'") + if num_bits != 8: + raise ValueError("cuVS IVF_PQ integration currently supports only num_bits=8") + + dimension = dataset.schema.field(column).type.list_size + if dimension % num_sub_vectors != 0: + raise ValueError( + "cuVS IVF_PQ integration requires vector dimension to be divisible by " + "num_sub_vectors" + ) + + if dataset.schema.field(column).nullable and filter_nan: + filt = f"{column} is not null" + else: + filt = None + + num_rows = dataset.count_rows(filter=filt) + if num_rows == 0: + raise ValueError("cuVS training requires at least one non-null training vector") + + train_rows = max(1, min(num_rows, max(num_partitions * sample_rate, 256 * 256))) + trainset = dataset.sample( + train_rows, + columns=[column], + filter=filt, + randomize_order=True, + ) + matrix = _column_to_numpy(trainset, column) + + ivf_pq = _require_cuvs() + build_params = ivf_pq.IndexParams( + n_lists=num_partitions, + metric=_metric_to_cuvs(metric_type), + kmeans_n_iters=max_iters, + kmeans_trainset_fraction=_estimate_trainset_fraction( + matrix.shape[0], num_partitions, sample_rate + ), + pq_bits=num_bits, + pq_dim=num_sub_vectors, + codebook_kind="subspace", + force_random_rotation=False, + add_data_on_build=False, + ) + + index = ivf_pq.build(build_params, matrix) + + centroids = _normalize_centroids(index, num_partitions, dimension) + pq_codebook = _normalize_pq_codebook(index, num_sub_vectors, num_bits, dimension) + return centroids, pq_codebook + + +def 
one_pass_train_ivf_pq_on_cuvs( + dataset, + column: str, + num_partitions: int, + metric_type: str, + accelerator: str, + num_sub_vectors: int, + *, + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + filter_nan: bool = True, +): + return train_ivf_pq_on_cuvs( + dataset, + column, + num_partitions, + metric_type, + accelerator, + num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, + ) + + +def prepare_global_ivf_pq_on_cuvs( + dataset, + column: str, + num_partitions: int, + num_sub_vectors: int, + *, + distance_type: str = "l2", + accelerator: str = "cuvs", + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, +): + centroids, pq_codebook = train_ivf_pq_on_cuvs( + dataset, + column, + num_partitions, + distance_type, + accelerator, + num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + ) + return {"ivf_centroids": centroids, "pq_codebook": pq_codebook} diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 7496746285a..a5e1681b250 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -39,6 +39,7 @@ from lance.log import LOGGER from .blob import BlobFile +from .cuvs import is_cuvs_accelerator from .dependencies import ( _check_for_numpy, _check_for_torch, @@ -2899,20 +2900,24 @@ def _create_index_impl( # Handle timing for various parts of accelerated builds timers = {} + use_cuvs = is_cuvs_accelerator(accelerator) if accelerator is not None and index_type != "IVF_PQ": + if use_cuvs: + raise ValueError( + f"accelerator='{accelerator}' only supports IVF_PQ index builds" + ) LOGGER.warning( "Index type %s does not support GPU acceleration; falling back to CPU", index_type, ) accelerator = None + use_cuvs = False # IMPORTANT: Distributed indexing is CPU-only. Enforce single-node when - # accelerator or torch-related paths are detected. 
- torch_detected = False + # any Python-side accelerator path is selected. + accelerated_build_detected = accelerator is not None try: - if accelerator is not None: - torch_detected = True - else: + if accelerator is None: impl = kwargs.get("implementation") use_torch_flag = kwargs.get("use_torch") is True one_pass_flag = kwargs.get("one_pass_ivfpq") is True @@ -2925,16 +2930,16 @@ def _create_index_impl( or torch_centroids or torch_codebook ): - torch_detected = True + accelerated_build_detected = True except Exception: # Be conservative: if detection fails, do not modify behavior pass - if torch_detected: + if accelerated_build_detected: if require_commit: if fragment_ids is not None or index_uuid is not None: LOGGER.info( - "Torch detected; " + "Accelerated build detected; " "enforce single-node indexing (distributed is CPU-only)." ) fragment_ids = None @@ -2942,63 +2947,92 @@ def _create_index_impl( else: if index_uuid is not None: LOGGER.info( - "Torch detected; " + "Accelerated build detected; " "enforce single-node indexing (distributed is CPU-only)." 
) index_uuid = None if accelerator is not None: - from .vector import ( - one_pass_assign_ivf_pq_on_accelerator, - one_pass_train_ivf_pq_on_accelerator, - ) - - LOGGER.info("Doing one-pass ivfpq accelerated computations") if num_partitions is None: num_rows = self.count_rows() num_partitions = _target_partition_size_to_num_partitions( num_rows, target_partition_size ) - timers["ivf+pq_train:start"] = time.time() - ( - ivf_centroids, - ivf_kmeans, - pq_codebook, - pq_kmeans_list, - ) = one_pass_train_ivf_pq_on_accelerator( - self, - column[0], - num_partitions, - metric, - accelerator, - num_sub_vectors=num_sub_vectors, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_train:end"] = time.time() - ivfpq_train_time = timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] - LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) - timers["ivf+pq_assign:start"] = time.time() - shuffle_output_dir, shuffle_buffers = one_pass_assign_ivf_pq_on_accelerator( - self, - column[0], - metric, - accelerator, - ivf_kmeans, - pq_kmeans_list, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_assign:end"] = time.time() - ivfpq_assign_time = ( - timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] - ) - LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) - kwargs["precomputed_shuffle_buffers"] = shuffle_buffers - kwargs["precomputed_shuffle_buffers_path"] = os.path.join( - shuffle_output_dir, "data" - ) + if use_cuvs: + from .cuvs import one_pass_train_ivf_pq_on_cuvs + + LOGGER.info("Doing one-pass ivfpq cuVS training") + timers["ivf+pq_train:start"] = time.time() + ivf_centroids, pq_codebook = one_pass_train_ivf_pq_on_cuvs( + self, + column[0], + num_partitions, + metric, + accelerator, + num_sub_vectors=num_sub_vectors, + sample_rate=kwargs.get("sample_rate", 256), + max_iters=kwargs.get("max_iters", 50), + num_bits=kwargs.get("num_bits", 8), + filter_nan=filter_nan, + ) + timers["ivf+pq_train:end"] = time.time() + 
ivfpq_train_time = ( + timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] + ) + LOGGER.info("cuVS ivf+pq training time: %ss", ivfpq_train_time) + else: + from .vector import ( + one_pass_assign_ivf_pq_on_accelerator, + one_pass_train_ivf_pq_on_accelerator, + ) + + LOGGER.info("Doing one-pass ivfpq accelerated computations") + timers["ivf+pq_train:start"] = time.time() + ( + ivf_centroids, + ivf_kmeans, + pq_codebook, + pq_kmeans_list, + ) = one_pass_train_ivf_pq_on_accelerator( + self, + column[0], + num_partitions, + metric, + accelerator, + num_sub_vectors=num_sub_vectors, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_train:end"] = time.time() + ivfpq_train_time = ( + timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] + ) + LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) + timers["ivf+pq_assign:start"] = time.time() + ( + shuffle_output_dir, + shuffle_buffers, + ) = one_pass_assign_ivf_pq_on_accelerator( + self, + column[0], + metric, + accelerator, + ivf_kmeans, + pq_kmeans_list, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_assign:end"] = time.time() + ivfpq_assign_time = ( + timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] + ) + LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) + + kwargs["precomputed_shuffle_buffers"] = shuffle_buffers + kwargs["precomputed_shuffle_buffers_path"] = os.path.join( + shuffle_output_dir, "data" + ) if index_type.startswith("IVF"): if (ivf_centroids is not None) and (ivf_centroids_file is not None): raise ValueError( diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py index c31ea0a7a0c..00591ead934 100644 --- a/python/python/lance/indices/builder.py +++ b/python/python/lance/indices/builder.py @@ -9,6 +9,7 @@ import numpy as np import pyarrow as pa +from lance.cuvs import is_cuvs_accelerator, prepare_global_ivf_pq_on_cuvs from lance.indices.ivf import IvfModel from lance.indices.pq import PqModel @@ 
-115,6 +116,11 @@ def train_ivf( self._verify_ivf_sample_rate(sample_rate, num_partitions, num_rows) distance_type = self._normalize_distance_type(distance_type) self._verify_ivf_params(num_partitions) + if is_cuvs_accelerator(accelerator): + raise NotImplementedError( + "IndicesBuilder.train_ivf does not support accelerator='cuvs'; " + "use prepare_global_ivf_pq instead" + ) if accelerator is None: from lance.lance import indices @@ -250,6 +256,25 @@ def prepare_global_ivf_pq( `IndicesBuilder.train_pq` (indices.train_pq_model). No public method names elsewhere are changed. """ + if is_cuvs_accelerator(accelerator): + if fragment_ids is not None: + raise NotImplementedError( + "fragment_ids is not supported with accelerator='cuvs'" + ) + num_rows = self._count_rows() + num_partitions = self._determine_num_partitions(num_partitions, num_rows) + num_subvectors = self._normalize_pq_params(num_subvectors, self.dimension) + return prepare_global_ivf_pq_on_cuvs( + self.dataset, + self.column[0], + num_partitions, + num_subvectors, + distance_type=distance_type, + accelerator=accelerator, + sample_rate=sample_rate, + max_iters=max_iters, + ) + # Global IVF training ivf_model = self.train_ivf( num_partitions, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index b20ffc8cf7a..54c6003c278 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -9,10 +9,12 @@ import string import tempfile import time +from importlib import import_module from pathlib import Path from typing import Optional import lance +import lance.cuvs as lance_cuvs import numpy as np import pyarrow as pa import pyarrow.compute as pc @@ -505,6 +507,15 @@ def test_create_index_unsupported_accelerator(tmp_path): accelerator="cuda:abc", ) + with pytest.raises(ValueError): + dataset.create_index( + "vector", + index_type="IVF_PQ", + num_partitions=4, + num_sub_vectors=16, + accelerator="cuvs:0", + ) + def 
test_create_index_accelerator_fallback(tmp_path, caplog): tbl = create_table() @@ -526,6 +537,124 @@ def test_create_index_accelerator_fallback(tmp_path, caplog): ) +def test_create_index_cuvs_dispatch(tmp_path, monkeypatch): + tbl = create_table(nvec=512, ndim=128) + dataset = lance.write_dataset(tbl, tmp_path) + calls = {} + + def fake_train( + dataset_arg, + column, + num_partitions, + metric_type, + accelerator, + num_sub_vectors, + *, + sample_rate, + max_iters, + num_bits, + filter_nan, + ): + calls["dataset"] = dataset_arg + calls["column"] = column + calls["num_partitions"] = num_partitions + calls["metric_type"] = metric_type + calls["accelerator"] = accelerator + calls["num_sub_vectors"] = num_sub_vectors + calls["sample_rate"] = sample_rate + calls["max_iters"] = max_iters + calls["num_bits"] = num_bits + calls["filter_nan"] = filter_nan + return ( + np.random.randn(num_partitions, 128).astype(np.float32), + np.random.randn(num_sub_vectors, 256, 128 // num_sub_vectors).astype( + np.float32 + ), + ) + + monkeypatch.setattr(lance_cuvs, "one_pass_train_ivf_pq_on_cuvs", fake_train) + + dataset = dataset.create_index( + "vector", + index_type="IVF_PQ", + num_partitions=4, + num_sub_vectors=16, + accelerator="cuvs", + ) + + assert calls["column"] == "vector" + assert calls["num_partitions"] == 4 + assert calls["metric_type"] == "L2" + assert calls["accelerator"] == "cuvs" + assert calls["num_sub_vectors"] == 16 + assert dataset.stats.index_stats("vector_idx")["index_type"] == "IVF_PQ" + + +def test_create_index_cuvs_rejects_non_ivf_pq(tmp_path): + tbl = create_table() + dataset = lance.write_dataset(tbl, tmp_path) + + with pytest.raises(ValueError, match="only supports IVF_PQ"): + dataset.create_index( + "vector", + index_type="IVF_FLAT", + num_partitions=4, + accelerator="cuvs", + ) + + +def test_prepare_global_ivf_pq_cuvs_dispatch(tmp_path, monkeypatch): + ds = _make_sample_dataset_base(tmp_path, "cuvs_prepare_ds", 512, 128) + builder = IndicesBuilder(ds, 
"vector") + builder_module = import_module("lance.indices.builder") + calls = {} + + def fake_prepare( + dataset_arg, + column, + num_partitions, + num_sub_vectors, + *, + distance_type, + accelerator, + sample_rate, + max_iters, + ): + calls["dataset"] = dataset_arg + calls["column"] = column + calls["num_partitions"] = num_partitions + calls["num_sub_vectors"] = num_sub_vectors + calls["distance_type"] = distance_type + calls["accelerator"] = accelerator + calls["sample_rate"] = sample_rate + calls["max_iters"] = max_iters + return { + "ivf_centroids": np.random.randn(num_partitions, 128).astype(np.float32), + "pq_codebook": np.random.randn( + num_sub_vectors, 256, 128 // num_sub_vectors + ).astype(np.float32), + } + + monkeypatch.setattr(builder_module, "prepare_global_ivf_pq_on_cuvs", fake_prepare) + + prepared = builder.prepare_global_ivf_pq( + num_partitions=4, + num_subvectors=16, + distance_type="l2", + accelerator="cuvs", + sample_rate=7, + max_iters=20, + ) + + assert calls["column"] == "vector" + assert calls["num_partitions"] == 4 + assert calls["num_sub_vectors"] == 16 + assert calls["distance_type"] == "l2" + assert calls["accelerator"] == "cuvs" + assert prepared["ivf_centroids"].shape == (4, 128) + assert prepared["pq_codebook"].shape == (16, 256, 8) + + def test_use_index(dataset, tmp_path): ann_ds = lance.write_dataset(dataset.to_table(), tmp_path / "indexed.lance") ann_ds = ann_ds.create_index( From 2f071f6a28fa399df7e19cd551adcb5dc663f5f3 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 02:05:54 +0800 Subject: [PATCH 02/21] python: document cuvs installation requirements --- python/DEVELOPMENT.md | 16 ++++++++++++++++ python/python/lance/cuvs.py | 4 +++- python/python/lance/dataset.py | 20 ++++++++++++++------ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/python/DEVELOPMENT.md b/python/DEVELOPMENT.md index 12c56549608..21dba0bdddd 100644 --- a/python/DEVELOPMENT.md +++ b/python/DEVELOPMENT.md @@ -8,6 +8,22 @@ uv sync 
--extra tests --extra dev Add extras such as `benchmarks`, `torch`, or `geo` only when you need them. After the environment is initialized, either activate it or use `uv run ...` for commands. +`accelerator="cuvs"` does not have a normal project extra today. cuVS Python +packages are published per CUDA major version and are typically installed from +NVIDIA's package index, for example: + +```shell +uv pip install --extra-index-url https://pypi.nvidia.com cuvs-cu12 +``` + +or: + +```shell +uv pip install --extra-index-url https://pypi.nvidia.com cuvs-cu13 +``` + +Pick the package that matches the CUDA version in your environment. + `uv sync` is not just downloading Python packages here. It also builds the local `pylance` Rust extension as part of the editable environment, so the first run, cache misses, or Rust dependency changes can make it noticeably slow. This is expected; let the build finish instead of interrupting it and switching to a different environment setup. ## Building the project diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index ab46eb8a432..6c0a4085c5c 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -20,7 +20,9 @@ def _require_cuvs(): return import_module("cuvs.neighbors.ivf_pq") except ModuleNotFoundError as exc: raise ModuleNotFoundError( - "accelerator='cuvs' requires the 'cuvs' package to be installed" + "accelerator='cuvs' requires cuVS Python bindings to be installed. " + "Install a CUDA-matched package such as 'cuvs-cu12' or 'cuvs-cu13' " + "from https://pypi.nvidia.com." ) from exc diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index a5e1681b250..cda142f5bb4 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3263,7 +3263,12 @@ def create_index( The number of sub-vectors for PQ (Product Quantization). accelerator : str or ``torch.Device``, optional If set, use an accelerator to speed up the training process. 
- Accepted accelerator: "cuda" (Nvidia GPU) and "mps" (Apple Silicon GPU). + Accepted accelerator: + + - "cuda" or ``torch.device(...)`` for the existing torch-based path + - "mps" for Apple Silicon GPU + - "cuvs" for the explicit cuVS-based IVF_PQ training path + If not set, use the CPU. index_cache_size : int, optional The size of the index cache in number of entries. Default value is 256. @@ -3372,8 +3377,10 @@ def create_index( Experimental Accelerator (GPU) support: - *accelerate*: use GPU to train IVF partitions. - Only supports CUDA (Nvidia) or MPS (Apple) currently. - Requires PyTorch being installed. + `accelerator="cuda"` and `accelerator="mps"` use the existing torch path. + `accelerator="cuvs"` uses cuVS for IVF_PQ training only. + The torch path requires PyTorch. The cuVS path requires the cuVS Python + bindings to be installed separately. .. code-block:: python @@ -3388,9 +3395,10 @@ def create_index( accelerator="cuda" ) - Note: GPU acceleration is currently supported only for the ``IVF_PQ`` index - type. Providing an accelerator for other index types will fall back to CPU - index building. + Note: accelerator support is currently limited to the ``IVF_PQ`` index type. + Providing ``accelerator="cuda"`` for other index types will fall back to CPU + index building. Providing ``accelerator="cuvs"`` for other index types will + raise an error. 
References ---------- From 1a6c44b7a1630d6ab8fe14aeea8dcce2d6c8bf99 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 14:17:05 +0800 Subject: [PATCH 03/21] python: fix cuvs training on real datasets --- python/python/lance/cuvs.py | 26 ++++++++++--- python/python/tests/test_vector_index.py | 48 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 6c0a4085c5c..0bfa910cb21 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -7,6 +7,7 @@ from typing import Tuple import pyarrow as pa +import pyarrow.compute as pc from .dependencies import numpy as np @@ -64,6 +65,10 @@ def _column_to_numpy(table: pa.Table, column: str) -> np.ndarray: def _as_numpy(array_like) -> np.ndarray: if isinstance(array_like, np.ndarray): return array_like + + if hasattr(array_like, "copy_to_host"): + return np.asarray(array_like.copy_to_host()) + try: array = np.asarray(array_like) if isinstance(array, np.ndarray): @@ -122,6 +127,20 @@ def _estimate_trainset_fraction( return min(1.0, desired_rows / num_rows) +def _sample_training_table(dataset, column: str, train_rows: int, filt: str | None) -> pa.Table: + if filt is None: + return dataset.sample(train_rows, columns=[column], randomize_order=True) + + total_rows = dataset.count_rows() + sample_rows = min(total_rows, max(train_rows * 2, train_rows + 1024)) + trainset = dataset.sample(sample_rows, columns=[column], randomize_order=True) + trainset = trainset.filter(pc.is_valid(trainset.column(column))) + if len(trainset) >= train_rows or sample_rows == total_rows: + return trainset.slice(0, min(train_rows, len(trainset))) + + return dataset.to_table(columns=[column], filter=filt, limit=train_rows) + + def train_ivf_pq_on_cuvs( dataset, column: str, @@ -157,12 +176,7 @@ def train_ivf_pq_on_cuvs( raise ValueError("cuVS training requires at least one non-null training vector") train_rows = max(1, min(num_rows, 
max(num_partitions * sample_rate, 256 * 256))) - trainset = dataset.sample( - train_rows, - columns=[column], - filter=filt, - randomize_order=True, - ) + trainset = _sample_training_table(dataset, column, train_rows, filt) matrix = _column_to_numpy(trainset, column) ivf_pq = _require_cuvs() diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 54c6003c278..b92952e5f3c 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -655,6 +655,54 @@ def fake_prepare( assert prepared["pq_codebook"].shape == (16, 256, 8) +def test_train_ivf_pq_on_cuvs_nullable_vectors(tmp_path, monkeypatch): + tbl = create_table(nvec=32, ndim=16, nullify=True) + dataset = lance.write_dataset(tbl, tmp_path) + + class FakeIndex: + centers = np.random.randn(4, 16).astype(np.float32) + pq_centers = np.random.randn(4, 256, 4).astype(np.float32) + + class FakeIvfPqModule: + class IndexParams: + def __init__(self, **kwargs): + self.kwargs = kwargs + + @staticmethod + def build(build_params, matrix): + assert build_params.kwargs["n_lists"] == 4 + assert matrix.shape[1] == 16 + assert matrix.dtype == np.float32 + return FakeIndex() + + monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) + + centroids, pq_codebook = lance_cuvs.train_ivf_pq_on_cuvs( + dataset, + "vector", + 4, + "L2", + "cuvs", + 4, + sample_rate=4, + ) + + assert centroids.shape == (4, 16) + assert pq_codebook.shape == (4, 256, 4) + + +def test_cuvs_as_numpy_prefers_copy_to_host(): + class FakeDeviceTensor: + def copy_to_host(self): + return np.arange(6, dtype=np.float32).reshape(2, 3) + + array = lance_cuvs._as_numpy(FakeDeviceTensor()) + + assert isinstance(array, np.ndarray) + assert array.shape == (2, 3) + assert array.dtype == np.float32 + + def test_use_index(dataset, tmp_path): ann_ds = lance.write_dataset(dataset.to_table(), tmp_path / "indexed.lance") ann_ds = ann_ds.create_index( From 
76995c0357d5f3e6c1dc55c4cfc26ac52a012dd7 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 17:03:47 +0800 Subject: [PATCH 04/21] python: format cuvs helper --- python/python/lance/cuvs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 0bfa910cb21..5c58af9552b 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -127,7 +127,9 @@ def _estimate_trainset_fraction( return min(1.0, desired_rows / num_rows) -def _sample_training_table(dataset, column: str, train_rows: int, filt: str | None) -> pa.Table: +def _sample_training_table( + dataset, column: str, train_rows: int, filt: str | None +) -> pa.Table: if filt is None: return dataset.sample(train_rows, columns=[column], randomize_order=True) From fbe0f50faf39f3c92365b534697f00dca0c4fbe6 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 17:12:33 +0800 Subject: [PATCH 05/21] python: clarify accelerator hardware requirements --- python/python/lance/dataset.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index cda142f5bb4..baa9890daf7 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3266,8 +3266,13 @@ def create_index( Accepted accelerator: - "cuda" or ``torch.device(...)`` for the existing torch-based path + on NVIDIA GPUs - "mps" for Apple Silicon GPU - - "cuvs" for the explicit cuVS-based IVF_PQ training path + - "cuvs" for the explicit cuVS-based IVF_PQ training path on NVIDIA + GPUs + + The cuVS path also requires the cuVS Python bindings to be installed + separately. If not set, use the CPU. index_cache_size : int, optional @@ -3377,10 +3382,13 @@ def create_index( Experimental Accelerator (GPU) support: - *accelerate*: use GPU to train IVF partitions. - `accelerator="cuda"` and `accelerator="mps"` use the existing torch path. 
- `accelerator="cuvs"` uses cuVS for IVF_PQ training only. - The torch path requires PyTorch. The cuVS path requires the cuVS Python - bindings to be installed separately. + `accelerator="cuda"` and `accelerator="mps"` use the existing torch + path. `accelerator="cuda"` runs on NVIDIA GPUs and `accelerator="mps"` + runs on Apple Silicon GPUs. `accelerator="cuvs"` uses cuVS for IVF_PQ + training only and requires an NVIDIA GPU. + + The torch path requires PyTorch. The cuVS path requires the cuVS + Python bindings to be installed separately. .. code-block:: python From f00d0783e221b27e05a8a4331f2af4d76ea93c2a Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 19:15:25 +0800 Subject: [PATCH 06/21] python: add cuvs one-pass ivfpq assignment --- python/python/lance/cuvs.py | 248 ++++++++++++++++++++++- python/python/lance/dataset.py | 28 ++- python/python/tests/test_vector_index.py | 70 +++++++ 3 files changed, 337 insertions(+), 9 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 5c58af9552b..45142b6b774 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -3,13 +3,20 @@ from __future__ import annotations +import re +import tempfile from importlib import import_module -from typing import Tuple +from typing import TYPE_CHECKING, Iterator, Tuple import pyarrow as pa import pyarrow.compute as pc from .dependencies import numpy as np +from .log import LOGGER +from .util import _normalize_metric_type + +if TYPE_CHECKING: + from pathlib import Path def is_cuvs_accelerator(accelerator: object) -> bool: @@ -34,8 +41,33 @@ def _optional_cupy(): return None +def _xp_module(): + cupy = _optional_cupy() + return cupy if cupy is not None else np + + +def _make_progress(total: int): + try: + from tqdm.auto import tqdm + + return tqdm(total=total) + except ModuleNotFoundError: + + class _NoOpProgress: + def set_description(self, _description: str): + return None + + def update(self, _count: int): + return None 
+ + def close(self): + return None + + return _NoOpProgress() + + def _metric_to_cuvs(metric_type: str) -> str: - metric_type = metric_type.lower() + metric_type = _normalize_metric_type(metric_type).lower() if metric_type in {"l2", "euclidean"}: return "sqeuclidean" if metric_type == "dot": @@ -45,12 +77,7 @@ def _metric_to_cuvs(metric_type: str) -> str: raise ValueError(f"Metric '{metric_type}' is not supported by cuVS IVF_PQ") -def _column_to_numpy(table: pa.Table, column: str) -> np.ndarray: - array = table.column(column).combine_chunks() - values = array.to_pylist() - if len(values) == 0: - raise ValueError("cuVS training requires at least one training vector") - matrix = np.asarray(values) +def _coerce_float_matrix(matrix: np.ndarray, *, column: str) -> np.ndarray: if matrix.ndim != 2: raise ValueError( f"Expected a 2D training matrix for column '{column}', got {matrix.shape}" @@ -62,6 +89,22 @@ def _column_to_numpy(table: pa.Table, column: str) -> np.ndarray: return matrix +def _column_to_numpy(table: pa.Table | pa.RecordBatch, column: str) -> np.ndarray: + array = table.column(column) + if isinstance(array, pa.ChunkedArray): + array = array.combine_chunks() + if len(array) == 0: + raise ValueError("cuVS training requires at least one training vector") + + if pa.types.is_fixed_size_list(array.type): + values = array.values.to_numpy(zero_copy_only=False) + matrix = values.reshape(len(array), array.type.list_size) + return _coerce_float_matrix(matrix, column=column) + + values = array.to_pylist() + return _coerce_float_matrix(np.asarray(values), column=column) + + def _as_numpy(array_like) -> np.ndarray: if isinstance(array_like, np.ndarray): return array_like @@ -143,6 +186,195 @@ def _sample_training_table( return dataset.to_table(columns=[column], filter=filt, limit=train_rows) +def _normalize_metric(metric_type: str) -> str: + return _normalize_metric_type(metric_type).lower() + + +def _backend_asarray(array_like, xp): + if xp is np: + return 
np.asarray(array_like) + return xp.asarray(array_like) + + +def _backend_to_numpy(array_like, xp) -> np.ndarray: + if xp is np: + return np.asarray(array_like) + return xp.asnumpy(array_like) + + +def _normalize_rows(matrix, xp): + eps = xp.finfo(matrix.dtype).eps + norms = xp.linalg.norm(matrix, axis=1, keepdims=True) + return matrix / xp.maximum(norms, eps) + + +def _argmin_distance(vectors, centroids, metric_type: str, xp): + if vectors.shape[0] == 0: + return xp.empty((0,), dtype=xp.int32) + + metric_type = _normalize_metric(metric_type) + if metric_type in {"l2", "euclidean"}: + vec_norms = xp.sum(vectors * vectors, axis=1, keepdims=True) + ctr_norms = xp.sum(centroids * centroids, axis=1, keepdims=False) + distances = vec_norms + ctr_norms - 2 * vectors @ centroids.T + return xp.argmin(distances, axis=1).astype(xp.int32, copy=False) + + if metric_type == "dot": + scores = vectors @ centroids.T + return xp.argmax(scores, axis=1).astype(xp.int32, copy=False) + + if metric_type == "cosine": + scores = _normalize_rows(vectors, xp) @ _normalize_rows(centroids, xp).T + return xp.argmax(scores, axis=1).astype(xp.int32, copy=False) + + raise ValueError(f"Metric '{metric_type}' is not supported by cuVS IVF_PQ") + + +def _encode_pq_codes(residuals, pq_codebook, metric_type: str, xp) -> np.ndarray: + num_rows, num_sub_vectors, _ = residuals.shape + codes = np.empty((num_rows, num_sub_vectors), dtype=np.uint8) + for subvector_idx in range(num_sub_vectors): + sub_vectors = residuals[:, subvector_idx, :] + sub_codebook = pq_codebook[subvector_idx] + nearest = _argmin_distance(sub_vectors, sub_codebook, metric_type, xp) + codes[:, subvector_idx] = _backend_to_numpy(nearest, xp).astype( + np.uint8, copy=False + ) + return codes + + +def _make_shuffle_batch( + row_ids: np.ndarray, + partitions: np.ndarray, + pq_codes: np.ndarray, + num_sub_vectors: int, +) -> pa.RecordBatch: + pq_values = pa.array(pq_codes.reshape(-1)) + pq_code_array = 
pa.FixedSizeListArray.from_arrays(pq_values, num_sub_vectors) + return pa.RecordBatch.from_arrays( + [ + pa.array(row_ids, type=pa.uint64()), + pa.array(partitions, type=pa.uint32()), + pq_code_array, + ], + schema=pa.schema( + [ + pa.field("row_id", pa.uint64()), + pa.field("__ivf_part_id", pa.uint32()), + pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), + ] + ), + ) + + +def one_pass_assign_ivf_pq_on_cuvs( + dataset, + column: str, + metric_type: str, + accelerator: str, + ivf_centroids: np.ndarray, + pq_codebook: np.ndarray, + dst_dataset_uri: str | Path | None = None, + batch_size: int = 1024 * 10 * 4, + *, + filter_nan: bool = True, +): + if accelerator != "cuvs": + raise ValueError("cuVS acceleration only supports accelerator='cuvs'") + + num_rows = dataset.count_rows() + if dataset.schema.field(column).nullable and filter_nan: + filt = f"{column} is not null" + else: + filt = None + + num_sub_vectors = pq_codebook.shape[0] + subvector_size = pq_codebook.shape[2] + dim = ivf_centroids.shape[1] + if dim != num_sub_vectors * subvector_size: + raise ValueError( + "cuVS returned incompatible IVF/PQ dimensions: " + f"centroids dim {dim} != {num_sub_vectors} * {subvector_size}" + ) + + xp = _xp_module() + backend_centroids = _backend_asarray(ivf_centroids, xp) + backend_codebook = _backend_asarray(pq_codebook, xp) + + progress = _make_progress(num_rows) + progress.set_description("Assigning partitions and computing pq codes") + + def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: + for batch in dataset.to_batches( + columns=[column], + filter=filt, + with_row_id=True, + batch_size=batch_size, + ): + vectors = _column_to_numpy(batch, column) + row_ids = batch.column("_rowid").to_numpy() + valid_mask = np.isfinite(vectors).all(axis=1) + if not np.all(valid_mask): + LOGGER.warning( + "%s vectors are ignored during partition assignment", + len(valid_mask) - int(valid_mask.sum()), + ) + row_ids = row_ids[valid_mask] + vectors = 
vectors[valid_mask] + if len(row_ids) == 0: + continue + backend_vectors = _backend_asarray(vectors, xp) + + partitions = _argmin_distance( + backend_vectors, backend_centroids, metric_type, xp + ) + selected_centroids = backend_centroids[partitions] + residuals = backend_vectors - selected_centroids + residuals = residuals.reshape(-1, num_sub_vectors, subvector_size) + pq_codes = _encode_pq_codes(residuals, backend_codebook, metric_type, xp) + + partition_batch = _make_shuffle_batch( + row_ids, + _backend_to_numpy(partitions, xp), + pq_codes, + num_sub_vectors, + ) + progress.update(partition_batch.num_rows) + yield partition_batch + + output_schema = pa.schema( + [ + pa.field("row_id", pa.uint64()), + pa.field("__ivf_part_id", pa.uint32()), + pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), + ] + ) + rbr = pa.RecordBatchReader.from_batches( + output_schema, _partition_and_pq_codes_assignment() + ) + if dst_dataset_uri is None: + dst_dataset_uri = tempfile.mkdtemp() + if re.search(r".:\\", dst_dataset_uri) is not None: + dst_dataset_uri = dst_dataset_uri.replace("\\", "/", 1) + + from . 
import write_dataset + + ds = write_dataset( + rbr, + dst_dataset_uri, + schema=output_schema, + data_storage_version="legacy", + ) + + progress.close() + LOGGER.info("Saved precomputed pq_codes to %s", dst_dataset_uri) + + shuffle_buffers = [ + data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() + ] + return str(dst_dataset_uri), shuffle_buffers + + def train_ivf_pq_on_cuvs( dataset, column: str, diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index baa9890daf7..485f4425694 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2960,7 +2960,10 @@ def _create_index_impl( ) if use_cuvs: - from .cuvs import one_pass_train_ivf_pq_on_cuvs + from .cuvs import ( + one_pass_assign_ivf_pq_on_cuvs, + one_pass_train_ivf_pq_on_cuvs, + ) LOGGER.info("Doing one-pass ivfpq cuVS training") timers["ivf+pq_train:start"] = time.time() @@ -2981,6 +2984,29 @@ def _create_index_impl( timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] ) LOGGER.info("cuVS ivf+pq training time: %ss", ivfpq_train_time) + timers["ivf+pq_assign:start"] = time.time() + ( + shuffle_output_dir, + shuffle_buffers, + ) = one_pass_assign_ivf_pq_on_cuvs( + self, + column[0], + metric, + accelerator, + ivf_centroids, + pq_codebook, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_assign:end"] = time.time() + ivfpq_assign_time = ( + timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] + ) + LOGGER.info("cuVS ivf+pq transform time: %ss", ivfpq_assign_time) + kwargs["precomputed_shuffle_buffers"] = shuffle_buffers + kwargs["precomputed_shuffle_buffers_path"] = os.path.join( + shuffle_output_dir, "data" + ) else: from .vector import ( one_pass_assign_ivf_pq_on_accelerator, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index b92952e5f3c..7b49137e7a1 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ 
-572,7 +572,48 @@ def fake_train( ), ) + def fake_assign( + dataset_arg, + column, + metric_type, + accelerator, + ivf_centroids, + pq_codebook, + dst_dataset_uri=None, + batch_size=20480, + *, + filter_nan, + ): + calls["assign_dataset"] = dataset_arg + calls["assign_column"] = column + calls["assign_metric_type"] = metric_type + calls["assign_accelerator"] = accelerator + calls["assign_batch_size"] = batch_size + calls["assign_filter_nan"] = filter_nan + + row_ids = dataset_arg.to_table(columns=[], with_row_id=True)[ + "_rowid" + ].to_numpy() + part_ids = pa.array(np.zeros(len(row_ids), dtype=np.uint32)) + pq_values = pa.array(np.zeros(len(row_ids) * 16, dtype=np.uint8)) + pq_codes = pa.FixedSizeListArray.from_arrays(pq_values, 16) + shuffle_ds_uri = str(tmp_path / "cuvs_shuffle_buffers") + shuffle_ds = lance.write_dataset( + pa.Table.from_arrays( + [pa.array(row_ids), part_ids, pq_codes], + names=["row_id", "__ivf_part_id", "__pq_code"], + ), + shuffle_ds_uri, + ) + shuffle_buffers = [ + data_file.path + for frag in shuffle_ds.get_fragments() + for data_file in frag.data_files() + ] + return shuffle_ds_uri, shuffle_buffers + monkeypatch.setattr(lance_cuvs, "one_pass_train_ivf_pq_on_cuvs", fake_train) + monkeypatch.setattr(lance_cuvs, "one_pass_assign_ivf_pq_on_cuvs", fake_assign) dataset = dataset.create_index( "vector", @@ -587,6 +628,9 @@ def fake_train( assert calls["metric_type"] == "L2" assert calls["accelerator"] == "cuvs" assert calls["num_sub_vectors"] == 16 + assert calls["assign_column"] == "vector" + assert calls["assign_metric_type"] == "L2" + assert calls["assign_accelerator"] == "cuvs" assert dataset.stats.index_stats("vector_idx")["index_type"] == "IVF_PQ" @@ -703,6 +747,32 @@ def copy_to_host(self): assert array.dtype == np.float32 +def test_one_pass_assign_ivf_pq_on_cuvs_writes_shuffle_buffers(tmp_path): + tbl = create_table(nvec=32, ndim=16) + dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") + + ivf_centroids = 
np.random.randn(4, 16).astype(np.float32) + pq_codebook = np.random.randn(4, 256, 4).astype(np.float32) + + shuffle_uri, shuffle_buffers = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( + dataset, + "vector", + "l2", + "cuvs", + ivf_centroids, + pq_codebook, + batch_size=8, + ) + + shuffle_ds = lance.dataset(shuffle_uri) + batch = next(shuffle_ds.to_batches()) + + assert len(shuffle_buffers) > 0 + assert batch.column("row_id").type == pa.uint64() + assert batch.column("__ivf_part_id").type == pa.uint32() + assert batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) + + def test_use_index(dataset, tmp_path): ann_ds = lance.write_dataset(dataset.to_table(), tmp_path / "indexed.lance") ann_ds = ann_ds.create_index( From 7f7e6e2e3829b45c9324ae7014c314489879eaab Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 23:31:24 +0800 Subject: [PATCH 07/21] python: use cuvs transform for full ivf pq build --- python/python/lance/cuvs.py | 219 ++++++++++------------- python/python/lance/dataset.py | 5 +- python/python/tests/test_vector_index.py | 45 ++++- 3 files changed, 142 insertions(+), 127 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 45142b6b774..8498be7720b 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -41,11 +41,6 @@ def _optional_cupy(): return None -def _xp_module(): - cupy = _optional_cupy() - return cupy if cupy is not None else np - - def _make_progress(total: int): try: from tqdm.auto import tqdm @@ -129,6 +124,16 @@ def _as_numpy(array_like) -> np.ndarray: raise TypeError("Unable to convert cuVS output to numpy") +def _to_cuvs_transform_input(matrix: np.ndarray): + cupy = _optional_cupy() + if cupy is None: + raise ModuleNotFoundError( + "accelerator='cuvs' full index build requires the 'cupy' package " + "to pass transform batches in device memory" + ) + return cupy.asarray(matrix) + + def _normalize_centroids(index, num_partitions: int, dimension: int) -> np.ndarray: centroids 
= _as_numpy(index.centers) if centroids.shape != (num_partitions, dimension): @@ -186,63 +191,6 @@ def _sample_training_table( return dataset.to_table(columns=[column], filter=filt, limit=train_rows) -def _normalize_metric(metric_type: str) -> str: - return _normalize_metric_type(metric_type).lower() - - -def _backend_asarray(array_like, xp): - if xp is np: - return np.asarray(array_like) - return xp.asarray(array_like) - - -def _backend_to_numpy(array_like, xp) -> np.ndarray: - if xp is np: - return np.asarray(array_like) - return xp.asnumpy(array_like) - - -def _normalize_rows(matrix, xp): - eps = xp.finfo(matrix.dtype).eps - norms = xp.linalg.norm(matrix, axis=1, keepdims=True) - return matrix / xp.maximum(norms, eps) - - -def _argmin_distance(vectors, centroids, metric_type: str, xp): - if vectors.shape[0] == 0: - return xp.empty((0,), dtype=xp.int32) - - metric_type = _normalize_metric(metric_type) - if metric_type in {"l2", "euclidean"}: - vec_norms = xp.sum(vectors * vectors, axis=1, keepdims=True) - ctr_norms = xp.sum(centroids * centroids, axis=1, keepdims=False) - distances = vec_norms + ctr_norms - 2 * vectors @ centroids.T - return xp.argmin(distances, axis=1).astype(xp.int32, copy=False) - - if metric_type == "dot": - scores = vectors @ centroids.T - return xp.argmax(scores, axis=1).astype(xp.int32, copy=False) - - if metric_type == "cosine": - scores = _normalize_rows(vectors, xp) @ _normalize_rows(centroids, xp).T - return xp.argmax(scores, axis=1).astype(xp.int32, copy=False) - - raise ValueError(f"Metric '{metric_type}' is not supported by cuVS IVF_PQ") - - -def _encode_pq_codes(residuals, pq_codebook, metric_type: str, xp) -> np.ndarray: - num_rows, num_sub_vectors, _ = residuals.shape - codes = np.empty((num_rows, num_sub_vectors), dtype=np.uint8) - for subvector_idx in range(num_sub_vectors): - sub_vectors = residuals[:, subvector_idx, :] - sub_codebook = pq_codebook[subvector_idx] - nearest = _argmin_distance(sub_vectors, sub_codebook, 
metric_type, xp) - codes[:, subvector_idx] = _backend_to_numpy(nearest, xp).astype( - np.uint8, copy=False - ) - return codes - - def _make_shuffle_batch( row_ids: np.ndarray, partitions: np.ndarray, @@ -267,6 +215,65 @@ def _make_shuffle_batch( ) +def _train_ivf_pq_index_on_cuvs( + dataset, + column: str, + num_partitions: int, + metric_type: str, + accelerator: str, + num_sub_vectors: int, + *, + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + filter_nan: bool = True, +): + if accelerator != "cuvs": + raise ValueError("cuVS acceleration only supports accelerator='cuvs'") + if num_bits != 8: + raise ValueError("cuVS IVF_PQ integration currently supports only num_bits=8") + + dimension = dataset.schema.field(column).type.list_size + if dimension % num_sub_vectors != 0: + raise ValueError( + "cuVS IVF_PQ integration requires vector dimension to be divisible by " + "num_sub_vectors" + ) + + if dataset.schema.field(column).nullable and filter_nan: + filt = f"{column} is not null" + else: + filt = None + + num_rows = dataset.count_rows(filter=filt) + if num_rows == 0: + raise ValueError("cuVS training requires at least one non-null training vector") + + train_rows = max(1, min(num_rows, max(num_partitions * sample_rate, 256 * 256))) + trainset = _sample_training_table(dataset, column, train_rows, filt) + matrix = _column_to_numpy(trainset, column) + + ivf_pq = _require_cuvs() + build_params = ivf_pq.IndexParams( + n_lists=num_partitions, + metric=_metric_to_cuvs(metric_type), + kmeans_n_iters=max_iters, + kmeans_trainset_fraction=_estimate_trainset_fraction( + matrix.shape[0], num_partitions, sample_rate + ), + pq_bits=num_bits, + pq_dim=num_sub_vectors, + codebook_kind="subspace", + force_random_rotation=False, + add_data_on_build=False, + ) + + index = ivf_pq.build(build_params, matrix) + centroids = _normalize_centroids(index, num_partitions, dimension) + pq_codebook = _normalize_pq_codebook(index, num_sub_vectors, num_bits, dimension) + return 
index, centroids, pq_codebook + + def one_pass_assign_ivf_pq_on_cuvs( dataset, column: str, @@ -274,6 +281,7 @@ def one_pass_assign_ivf_pq_on_cuvs( accelerator: str, ivf_centroids: np.ndarray, pq_codebook: np.ndarray, + trained_index=None, dst_dataset_uri: str | Path | None = None, batch_size: int = 1024 * 10 * 4, *, @@ -289,18 +297,14 @@ def one_pass_assign_ivf_pq_on_cuvs( filt = None num_sub_vectors = pq_codebook.shape[0] - subvector_size = pq_codebook.shape[2] - dim = ivf_centroids.shape[1] - if dim != num_sub_vectors * subvector_size: + ivf_pq = _require_cuvs() + + if trained_index is None: raise ValueError( - "cuVS returned incompatible IVF/PQ dimensions: " - f"centroids dim {dim} != {num_sub_vectors} * {subvector_size}" + "one_pass_assign_ivf_pq_on_cuvs requires a trained cuVS index for " + "single-node transform" ) - xp = _xp_module() - backend_centroids = _backend_asarray(ivf_centroids, xp) - backend_codebook = _backend_asarray(pq_codebook, xp) - progress = _make_progress(num_rows) progress.set_description("Assigning partitions and computing pq codes") @@ -323,19 +327,20 @@ def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: vectors = vectors[valid_mask] if len(row_ids) == 0: continue - backend_vectors = _backend_asarray(vectors, xp) - - partitions = _argmin_distance( - backend_vectors, backend_centroids, metric_type, xp + partitions, pq_codes = ivf_pq.transform( + trained_index, _to_cuvs_transform_input(vectors) ) - selected_centroids = backend_centroids[partitions] - residuals = backend_vectors - selected_centroids - residuals = residuals.reshape(-1, num_sub_vectors, subvector_size) - pq_codes = _encode_pq_codes(residuals, backend_codebook, metric_type, xp) + partitions = _as_numpy(partitions).astype(np.uint32, copy=False) + pq_codes = _as_numpy(pq_codes).astype(np.uint8, copy=False) + if pq_codes.shape != (len(row_ids), num_sub_vectors): + raise ValueError( + "cuVS transform returned incompatible PQ codes shape: " + f"expected 
{(len(row_ids), num_sub_vectors)}, got {pq_codes.shape}" + ) partition_batch = _make_shuffle_batch( row_ids, - _backend_to_numpy(partitions, xp), + partitions, pq_codes, num_sub_vectors, ) @@ -388,50 +393,18 @@ def train_ivf_pq_on_cuvs( num_bits: int = 8, filter_nan: bool = True, ) -> Tuple[np.ndarray, np.ndarray]: - if accelerator != "cuvs": - raise ValueError("cuVS acceleration only supports accelerator='cuvs'") - if num_bits != 8: - raise ValueError("cuVS IVF_PQ integration currently supports only num_bits=8") - - dimension = dataset.schema.field(column).type.list_size - if dimension % num_sub_vectors != 0: - raise ValueError( - "cuVS IVF_PQ integration requires vector dimension to be divisible by " - "num_sub_vectors" - ) - - if dataset.schema.field(column).nullable and filter_nan: - filt = f"{column} is not null" - else: - filt = None - - num_rows = dataset.count_rows(filter=filt) - if num_rows == 0: - raise ValueError("cuVS training requires at least one non-null training vector") - - train_rows = max(1, min(num_rows, max(num_partitions * sample_rate, 256 * 256))) - trainset = _sample_training_table(dataset, column, train_rows, filt) - matrix = _column_to_numpy(trainset, column) - - ivf_pq = _require_cuvs() - build_params = ivf_pq.IndexParams( - n_lists=num_partitions, - metric=_metric_to_cuvs(metric_type), - kmeans_n_iters=max_iters, - kmeans_trainset_fraction=_estimate_trainset_fraction( - matrix.shape[0], num_partitions, sample_rate - ), - pq_bits=num_bits, - pq_dim=num_sub_vectors, - codebook_kind="subspace", - force_random_rotation=False, - add_data_on_build=False, + _, centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( + dataset, + column, + num_partitions, + metric_type, + accelerator, + num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, ) - - index = ivf_pq.build(build_params, matrix) - - centroids = _normalize_centroids(index, num_partitions, dimension) - pq_codebook = 
_normalize_pq_codebook(index, num_sub_vectors, num_bits, dimension) return centroids, pq_codebook diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 485f4425694..1b24962a700 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2961,13 +2961,13 @@ def _create_index_impl( if use_cuvs: from .cuvs import ( + _train_ivf_pq_index_on_cuvs, one_pass_assign_ivf_pq_on_cuvs, - one_pass_train_ivf_pq_on_cuvs, ) LOGGER.info("Doing one-pass ivfpq cuVS training") timers["ivf+pq_train:start"] = time.time() - ivf_centroids, pq_codebook = one_pass_train_ivf_pq_on_cuvs( + trained_index, ivf_centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( self, column[0], num_partitions, @@ -2995,6 +2995,7 @@ def _create_index_impl( accelerator, ivf_centroids, pq_codebook, + trained_index=trained_index, batch_size=20480, filter_nan=filter_nan, ) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 7b49137e7a1..a54475e1ae0 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -542,6 +542,10 @@ def test_create_index_cuvs_dispatch(tmp_path, monkeypatch): dataset = lance.write_dataset(tbl, tmp_path) calls = {} + class FakeIndex: + pq_dim = 16 + pq_bits = 8 + def fake_train( dataset_arg, column, @@ -566,6 +570,7 @@ def fake_train( calls["num_bits"] = num_bits calls["filter_nan"] = filter_nan return ( + FakeIndex(), np.random.randn(num_partitions, 128).astype(np.float32), np.random.randn(num_sub_vectors, 256, 128 // num_sub_vectors).astype( np.float32 @@ -579,6 +584,7 @@ def fake_assign( accelerator, ivf_centroids, pq_codebook, + trained_index=None, dst_dataset_uri=None, batch_size=20480, *, @@ -588,6 +594,7 @@ def fake_assign( calls["assign_column"] = column calls["assign_metric_type"] = metric_type calls["assign_accelerator"] = accelerator + calls["assign_trained_index"] = trained_index calls["assign_batch_size"] = batch_size 
calls["assign_filter_nan"] = filter_nan @@ -612,7 +619,7 @@ def fake_assign( ] return shuffle_ds_uri, shuffle_buffers - monkeypatch.setattr(lance_cuvs, "one_pass_train_ivf_pq_on_cuvs", fake_train) + monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_index_on_cuvs", fake_train) monkeypatch.setattr(lance_cuvs, "one_pass_assign_ivf_pq_on_cuvs", fake_assign) dataset = dataset.create_index( @@ -631,6 +638,7 @@ def fake_assign( assert calls["assign_column"] == "vector" assert calls["assign_metric_type"] == "L2" assert calls["assign_accelerator"] == "cuvs" + assert isinstance(calls["assign_trained_index"], FakeIndex) assert dataset.stats.index_stats("vector_idx")["index_type"] == "IVF_PQ" @@ -747,13 +755,45 @@ def copy_to_host(self): assert array.dtype == np.float32 -def test_one_pass_assign_ivf_pq_on_cuvs_writes_shuffle_buffers(tmp_path): +def test_one_pass_assign_ivf_pq_on_cuvs_writes_shuffle_buffers(tmp_path, monkeypatch): tbl = create_table(nvec=32, ndim=16) dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") ivf_centroids = np.random.randn(4, 16).astype(np.float32) pq_codebook = np.random.randn(4, 256, 4).astype(np.float32) + class FakeDeviceTensor: + def __init__(self, array): + self._array = array + + def copy_to_host(self): + return self._array + + class FakeCupyArray: + def __init__(self, array): + self.array = array + + class FakeCupyModule: + @staticmethod + def asarray(array): + return FakeCupyArray(array) + + class FakeIndex: + pq_dim = 4 + pq_bits = 8 + + class FakeIvfPqModule: + @staticmethod + def transform(index, vectors): + assert isinstance(index, FakeIndex) + assert isinstance(vectors, FakeCupyArray) + labels = np.arange(len(vectors.array), dtype=np.uint32) % 4 + pq_codes = np.full((len(vectors.array), 4), 7, dtype=np.uint8) + return FakeDeviceTensor(labels), FakeDeviceTensor(pq_codes) + + monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) + monkeypatch.setattr(lance_cuvs, "_optional_cupy", lambda: FakeCupyModule()) + 
shuffle_uri, shuffle_buffers = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( dataset, "vector", @@ -761,6 +801,7 @@ def test_one_pass_assign_ivf_pq_on_cuvs_writes_shuffle_buffers(tmp_path): "cuvs", ivf_centroids, pq_codebook, + trained_index=FakeIndex(), batch_size=8, ) From f9c5d03d8f34e680bb54f03a626d219bcde62e8f Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 2 Apr 2026 17:04:49 +0800 Subject: [PATCH 08/21] python: route cuvs precomputed shuffle to v3 files --- python/python/lance/cuvs.py | 135 ++++++++++---- python/python/lance/dataset.py | 6 +- python/python/tests/test_vector_index.py | 16 +- rust/lance-index/src/vector/v3/shuffler.rs | 203 ++++++++++++++++++++- rust/lance/src/index/vector/builder.rs | 70 ++++++- 5 files changed, 376 insertions(+), 54 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 8498be7720b..ed8cad83907 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -3,6 +3,8 @@ from __future__ import annotations +import json +import os import re import tempfile from importlib import import_module @@ -12,6 +14,7 @@ import pyarrow.compute as pc from .dependencies import numpy as np +from .file import LanceFileWriter from .log import LOGGER from .util import _normalize_metric_type @@ -195,24 +198,108 @@ def _make_shuffle_batch( row_ids: np.ndarray, partitions: np.ndarray, pq_codes: np.ndarray, + num_partitions: int, num_sub_vectors: int, -) -> pa.RecordBatch: - pq_values = pa.array(pq_codes.reshape(-1)) +) -> tuple[pa.RecordBatch, pa.RecordBatch]: + sort_indices = np.argsort(partitions, kind="stable") + row_ids = row_ids[sort_indices] + partitions = partitions[sort_indices] + pq_codes = pq_codes[sort_indices] + + pq_values = pa.array(pq_codes.reshape(-1), type=pa.uint8()) pq_code_array = pa.FixedSizeListArray.from_arrays(pq_values, num_sub_vectors) - return pa.RecordBatch.from_arrays( + partition_counts = np.bincount(partitions, minlength=num_partitions).astype( + np.uint64, copy=False + ) 
+ offsets = np.cumsum(partition_counts, dtype=np.uint64) + data_batch = pa.RecordBatch.from_arrays( [ pa.array(row_ids, type=pa.uint64()), - pa.array(partitions, type=pa.uint32()), pq_code_array, ], schema=pa.schema( [ - pa.field("row_id", pa.uint64()), - pa.field("__ivf_part_id", pa.uint32()), + pa.field("_rowid", pa.uint64()), pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), ] ), ) + offsets_batch = pa.RecordBatch.from_arrays( + [pa.array(offsets, type=pa.uint64())], + schema=pa.schema([pa.field("offset", pa.uint64())]), + ) + return data_batch, offsets_batch + + +def _shuffle_metadata( + num_partitions: int, num_batches: int, partition_counts +) -> dict[str, str]: + return { + "lance:shuffle:num_partitions": str(num_partitions), + "lance:shuffle:num_batches": str(num_batches), + "lance:shuffle:partition_counts": json.dumps(list(partition_counts)), + "lance:shuffle:total_loss": "0.0", + } + + +def _write_v3_shuffle_files( + output_root: str, + batches: Iterator[tuple[pa.RecordBatch, pa.RecordBatch]], + *, + num_partitions: int, + num_sub_vectors: int, +) -> list[str]: + os.makedirs(output_root, exist_ok=True) + data_path = os.path.join(output_root, "shuffle_data.lance") + offsets_path = os.path.join(output_root, "shuffle_offsets.lance") + + data_schema = pa.schema( + [ + pa.field("_rowid", pa.uint64()), + pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), + ] + ) + offsets_schema = pa.schema([pa.field("offset", pa.uint64())]) + + data_writer = None + offsets_writer = LanceFileWriter(offsets_path, offsets_schema) + total_partition_counts = np.zeros(num_partitions, dtype=np.uint64) + global_row_count = np.uint64(0) + num_batches = 0 + + for data_batch, offsets_batch in batches: + if data_writer is None: + data_writer = LanceFileWriter(data_path, data_batch.schema) + data_writer.write_batch(data_batch) + + offsets = offsets_batch.column(0).to_numpy() + adjusted_offsets = offsets + global_row_count + 
offsets_writer.write_batch( + pa.RecordBatch.from_arrays( + [pa.array(adjusted_offsets, type=pa.uint64())], + schema=offsets_schema, + ) + ) + last_offset = np.uint64(0) + for idx, offset in enumerate(offsets): + total_partition_counts[idx] += np.uint64(offset) - last_offset + last_offset = np.uint64(offset) + global_row_count += np.uint64(data_batch.num_rows) + num_batches += 1 + + if data_writer is None: + data_writer = LanceFileWriter(data_path, data_schema) + + metadata = _shuffle_metadata( + num_partitions, num_batches, total_partition_counts.tolist() + ) + for key, value in metadata.items(): + data_writer.add_schema_metadata(key, value) + offsets_writer.add_schema_metadata(key, value) + + data_writer.close() + offsets_writer.close() + return ["shuffle_data.lance", "shuffle_offsets.lance"] def _train_ivf_pq_index_on_cuvs( @@ -283,7 +370,7 @@ def one_pass_assign_ivf_pq_on_cuvs( pq_codebook: np.ndarray, trained_index=None, dst_dataset_uri: str | Path | None = None, - batch_size: int = 1024 * 10 * 4, + batch_size: int = 1024 * 128, *, filter_nan: bool = True, ): @@ -308,7 +395,9 @@ def one_pass_assign_ivf_pq_on_cuvs( progress = _make_progress(num_rows) progress.set_description("Assigning partitions and computing pq codes") - def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: + def _partition_and_pq_codes_assignment() -> Iterator[ + tuple[pa.RecordBatch, pa.RecordBatch] + ]: for batch in dataset.to_batches( columns=[column], filter=filt, @@ -342,41 +431,25 @@ def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: row_ids, partitions, pq_codes, + ivf_centroids.shape[0], num_sub_vectors, ) - progress.update(partition_batch.num_rows) + progress.update(len(row_ids)) yield partition_batch - output_schema = pa.schema( - [ - pa.field("row_id", pa.uint64()), - pa.field("__ivf_part_id", pa.uint32()), - pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), - ] - ) - rbr = pa.RecordBatchReader.from_batches( - output_schema, 
_partition_and_pq_codes_assignment() - ) if dst_dataset_uri is None: dst_dataset_uri = tempfile.mkdtemp() if re.search(r".:\\", dst_dataset_uri) is not None: dst_dataset_uri = dst_dataset_uri.replace("\\", "/", 1) - - from . import write_dataset - - ds = write_dataset( - rbr, - dst_dataset_uri, - schema=output_schema, - data_storage_version="legacy", + shuffle_buffers = _write_v3_shuffle_files( + str(dst_dataset_uri), + _partition_and_pq_codes_assignment(), + num_partitions=ivf_centroids.shape[0], + num_sub_vectors=num_sub_vectors, ) progress.close() LOGGER.info("Saved precomputed pq_codes to %s", dst_dataset_uri) - - shuffle_buffers = [ - data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() - ] return str(dst_dataset_uri), shuffle_buffers diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 1b24962a700..5dee7767918 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2996,7 +2996,7 @@ def _create_index_impl( ivf_centroids, pq_codebook, trained_index=trained_index, - batch_size=20480, + batch_size=1024 * 128, filter_nan=filter_nan, ) timers["ivf+pq_assign:end"] = time.time() @@ -3005,9 +3005,7 @@ def _create_index_impl( ) LOGGER.info("cuVS ivf+pq transform time: %ss", ivfpq_assign_time) kwargs["precomputed_shuffle_buffers"] = shuffle_buffers - kwargs["precomputed_shuffle_buffers_path"] = os.path.join( - shuffle_output_dir, "data" - ) + kwargs["precomputed_shuffle_buffers_path"] = shuffle_output_dir else: from .vector import ( one_pass_assign_ivf_pq_on_accelerator, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index a54475e1ae0..cf0bfe6e2bf 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -805,13 +805,17 @@ def transform(index, vectors): batch_size=8, ) - shuffle_ds = lance.dataset(shuffle_uri) - batch = next(shuffle_ds.to_batches()) + from lance.file import LanceFileReader 
- assert len(shuffle_buffers) > 0 - assert batch.column("row_id").type == pa.uint64() - assert batch.column("__ivf_part_id").type == pa.uint32() - assert batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) + data_reader = LanceFileReader(str(Path(shuffle_uri) / "shuffle_data.lance")) + offsets_reader = LanceFileReader(str(Path(shuffle_uri) / "shuffle_offsets.lance")) + data_batch = next(data_reader.read_all(batch_size=1024).to_batches()) + offsets_batch = next(offsets_reader.read_all(batch_size=1024).to_batches()) + + assert shuffle_buffers == ["shuffle_data.lance", "shuffle_offsets.lance"] + assert data_batch.column("_rowid").type == pa.uint64() + assert data_batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) + assert offsets_batch.column("offset").type == pa.uint64() def test_use_index(dataset, tmp_path): diff --git a/rust/lance-index/src/vector/v3/shuffler.rs b/rust/lance-index/src/vector/v3/shuffler.rs index 0bf714df237..45c719d523a 100644 --- a/rust/lance-index/src/vector/v3/shuffler.rs +++ b/rust/lance-index/src/vector/v3/shuffler.rs @@ -4,6 +4,7 @@ //! Shuffler is a component that takes a stream of record batches and shuffles them into //! the corresponding IVF partitions. +use std::collections::HashMap; use std::ops::Range; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; @@ -36,6 +37,13 @@ use object_store::path::Path; use crate::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN}; +const SHUFFLE_NUM_PARTITIONS_METADATA_KEY: &str = "lance:shuffle:num_partitions"; +const SHUFFLE_NUM_BATCHES_METADATA_KEY: &str = "lance:shuffle:num_batches"; +const SHUFFLE_PARTITION_COUNTS_METADATA_KEY: &str = "lance:shuffle:partition_counts"; +const SHUFFLE_TOTAL_LOSS_METADATA_KEY: &str = "lance:shuffle:total_loss"; +pub const SHUFFLE_DATA_FILE_NAME: &str = "shuffle_data.lance"; +pub const SHUFFLE_OFFSETS_FILE_NAME: &str = "shuffle_offsets.lance"; + #[async_trait::async_trait] /// A reader that can read the shuffled partitions. 
pub trait ShuffleReader: Send + Sync { @@ -435,7 +443,7 @@ impl Shuffler for TwoFileShuffler { ); // Create data file writer - let data_path = self.output_dir.child("shuffle_data.lance"); + let data_path = self.output_dir.child(SHUFFLE_DATA_FILE_NAME); let spill_path = self.output_dir.child("shuffle_data.spill"); let writer = self.object_store.create(&data_path).await?; let mut file_writer = FileWriter::try_new( @@ -446,7 +454,7 @@ impl Shuffler for TwoFileShuffler { .with_page_metadata_spill(self.object_store.clone(), spill_path); // Create offsets file writer - let offsets_path = self.output_dir.child("shuffle_offsets.lance"); + let offsets_path = self.output_dir.child(SHUFFLE_OFFSETS_FILE_NAME); let spill_path = self.output_dir.child("shuffle_offsets.spill"); let writer = self.object_store.create(&offsets_path).await?; let mut offsets_writer = FileWriter::try_new( @@ -527,13 +535,37 @@ impl Shuffler for TwoFileShuffler { .await?; } + let partition_counts_json = serde_json::to_string(&partition_counts).map_err(|e| { + Error::invalid_input(format!("Failed to serialize shuffle partition counts: {e}")) + })?; + let num_partitions_str = num_partitions.to_string(); + let num_batches_str = num_batches + .load(std::sync::atomic::Ordering::Relaxed) + .to_string(); + let total_loss_str = total_loss.lock().unwrap().to_string(); + for writer in [&mut file_writer, &mut offsets_writer] { + writer.add_schema_metadata( + SHUFFLE_NUM_PARTITIONS_METADATA_KEY, + num_partitions_str.clone(), + ); + writer.add_schema_metadata(SHUFFLE_NUM_BATCHES_METADATA_KEY, num_batches_str.clone()); + writer.add_schema_metadata( + SHUFFLE_PARTITION_COUNTS_METADATA_KEY, + partition_counts_json.clone(), + ); + writer.add_schema_metadata(SHUFFLE_TOTAL_LOSS_METADATA_KEY, total_loss_str.clone()); + } + // Finish files file_writer.finish().await?; offsets_writer.finish().await?; - let num_batches = num_batches.load(std::sync::atomic::Ordering::Relaxed); - - let total_loss_val = 
*total_loss.lock().unwrap(); + let num_batches = num_batches_str + .parse::() + .expect("num_batches string was produced from u64"); + let total_loss_val = total_loss_str + .parse::() + .expect("total_loss string was produced from f64"); TwoFileShuffleReader::try_new( self.object_store.clone(), @@ -558,6 +590,46 @@ pub struct TwoFileShuffleReader { } impl TwoFileShuffleReader { + pub async fn try_open_existing( + object_store: Arc, + output_dir: Path, + data_file: impl AsRef, + offsets_file: impl AsRef, + ) -> Result> { + let scheduler_config = SchedulerConfig::max_bandwidth(&object_store); + let scheduler = ScanScheduler::new(object_store, scheduler_config); + + let file_reader = FileReader::try_open( + scheduler + .open_file( + &output_dir.child(data_file.as_ref()), + &CachedFileSize::unknown(), + ) + .await?, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await?; + + let offsets_reader = FileReader::try_open( + scheduler + .open_file( + &output_dir.child(offsets_file.as_ref()), + &CachedFileSize::unknown(), + ) + .await?, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await?; + + Self::from_existing_readers(scheduler, file_reader, offsets_reader) + } + async fn try_new( object_store: Arc, output_dir: Path, @@ -573,7 +645,7 @@ impl TwoFileShuffleReader { let scheduler_config = SchedulerConfig::max_bandwidth(&object_store); let scheduler = ScanScheduler::new(object_store, scheduler_config); - let data_path = output_dir.child("shuffle_data.lance"); + let data_path = output_dir.child(SHUFFLE_DATA_FILE_NAME); let file_reader = FileReader::try_open( scheduler .open_file(&data_path, &CachedFileSize::unknown()) @@ -585,7 +657,7 @@ impl TwoFileShuffleReader { ) .await?; - let offsets_path = output_dir.child("shuffle_offsets.lance"); + let offsets_path = output_dir.child(SHUFFLE_OFFSETS_FILE_NAME); let offsets_reader = FileReader::try_open( scheduler .open_file(&offsets_path, 
&CachedFileSize::unknown()) @@ -608,6 +680,87 @@ impl TwoFileShuffleReader { })) } + fn from_existing_readers( + scheduler: Arc, + file_reader: FileReader, + offsets_reader: FileReader, + ) -> Result> { + let metadata: &HashMap = &offsets_reader.schema().metadata; + + let num_partitions = metadata + .get(SHUFFLE_NUM_PARTITIONS_METADATA_KEY) + .ok_or_else(|| { + Error::invalid_input(format!( + "Missing required metadata key {SHUFFLE_NUM_PARTITIONS_METADATA_KEY} in precomputed V3 shuffle offsets file" + )) + })? + .parse::() + .map_err(|e| { + Error::invalid_input(format!( + "Invalid value for {SHUFFLE_NUM_PARTITIONS_METADATA_KEY}: {e}" + )) + })?; + let num_batches = metadata + .get(SHUFFLE_NUM_BATCHES_METADATA_KEY) + .ok_or_else(|| { + Error::invalid_input(format!( + "Missing required metadata key {SHUFFLE_NUM_BATCHES_METADATA_KEY} in precomputed V3 shuffle offsets file" + )) + })? + .parse::() + .map_err(|e| { + Error::invalid_input(format!( + "Invalid value for {SHUFFLE_NUM_BATCHES_METADATA_KEY}: {e}" + )) + })?; + let partition_counts = serde_json::from_str::>( + metadata + .get(SHUFFLE_PARTITION_COUNTS_METADATA_KEY) + .ok_or_else(|| { + Error::invalid_input(format!( + "Missing required metadata key {SHUFFLE_PARTITION_COUNTS_METADATA_KEY} in precomputed V3 shuffle offsets file" + )) + })?, + ) + .map_err(|e| { + Error::invalid_input(format!( + "Invalid value for {SHUFFLE_PARTITION_COUNTS_METADATA_KEY}: {e}" + )) + })?; + if partition_counts.len() != num_partitions { + return Err(Error::invalid_input(format!( + "Precomputed V3 shuffle partition count length {} does not match num_partitions {}", + partition_counts.len(), + num_partitions + ))); + } + let total_loss = metadata + .get(SHUFFLE_TOTAL_LOSS_METADATA_KEY) + .map(|value| { + value.parse::().map_err(|e| { + Error::invalid_input(format!( + "Invalid value for {SHUFFLE_TOTAL_LOSS_METADATA_KEY}: {e}" + )) + }) + }) + .transpose()? 
+ .unwrap_or(0.0); + + if num_batches == 0 { + return Ok(Box::new(EmptyReader)); + } + + Ok(Box::new(Self { + _scheduler: scheduler, + file_reader, + offsets_reader, + num_partitions, + num_batches, + partition_counts, + total_loss, + })) + } + async fn partition_ranges(&self, partition_id: usize) -> Result>> { let mut positions = Vec::with_capacity(self.num_batches as usize * 2); for batch_idx in 0..self.num_batches { @@ -844,6 +997,42 @@ mod tests { assert!((loss - 4.25).abs() < 1e-10, "expected 4.25, got {}", loss); } + #[tokio::test] + async fn test_two_file_shuffler_reopen_existing_files() { + let dir = TempStrDir::default(); + let output_dir = Path::from(dir.as_ref()); + let num_partitions = 3; + + let batch1 = make_batch(&[0, 1, 2], &[10, 20, 30], Some(1.5)); + let batch2 = make_batch(&[2, 0, 1, 0], &[40, 50, 60, 70], Some(2.0)); + + let shuffler = TwoFileShuffler::new(output_dir.clone(), num_partitions); + let stream = batches_to_stream(vec![batch1, batch2]); + let _ = shuffler.shuffle(stream).await.unwrap(); + + let reopened = TwoFileShuffleReader::try_open_existing( + Arc::new(ObjectStore::local()), + output_dir, + SHUFFLE_DATA_FILE_NAME, + SHUFFLE_OFFSETS_FILE_NAME, + ) + .await + .unwrap(); + + assert_eq!(reopened.partition_size(0).unwrap(), 3); + assert_eq!(reopened.partition_size(1).unwrap(), 2); + assert_eq!(reopened.partition_size(2).unwrap(), 2); + + let p0 = collect_partition(reopened.as_ref(), 0).await.unwrap(); + let vals: &Int32Array = p0.column_by_name("val").unwrap().as_primitive(); + let mut v: Vec = vals.iter().map(|x| x.unwrap()).collect(); + v.sort(); + assert_eq!(v, vec![10, 50, 70]); + + let loss = reopened.total_loss().unwrap(); + assert!((loss - 3.5).abs() < 1e-10, "expected 3.5, got {}", loss); + } + #[tokio::test] async fn test_two_file_shuffler_single_batch() { let dir = TempStrDir::default(); diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 24298cbba18..9a7001834d0 100644 --- 
a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -3,6 +3,7 @@ use std::collections::HashSet; use std::future; +use std::path::Path as StdPath; use std::sync::Arc; use std::{collections::HashMap, pin::Pin}; @@ -44,7 +45,10 @@ use lance_index::vector::quantizer::{QuantizerMetadata, QuantizerStorage}; use lance_index::vector::shared::{SupportedIvfIndexType, write_unified_ivf_and_index_metadata}; use lance_index::vector::storage::STORAGE_METADATA_KEY; use lance_index::vector::transform::Flatten; -use lance_index::vector::v3::shuffler::{EmptyReader, IvfShufflerReader}; +use lance_index::vector::v3::shuffler::{ + EmptyReader, IvfShufflerReader, SHUFFLE_DATA_FILE_NAME, SHUFFLE_OFFSETS_FILE_NAME, + TwoFileShuffleReader, +}; use lance_index::vector::v3::subindex::SubIndexType; use lance_index::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN, PQ_CODE_COLUMN, VectorIndex}; use lance_index::vector::{PART_ID_FIELD, ivf::storage::IvfModel}; @@ -141,6 +145,43 @@ type BuildStream = Pin::Storage, S, f64)>>> + Send>>; impl IvfIndexBuilder { + async fn try_open_precomputed_v3_shuffle_reader( + &self, + root: &Path, + files: &[String], + ) -> Result>> { + if files.len() != 2 { + return Ok(None); + } + + let mut data_file = None; + let mut offsets_file = None; + for file in files { + let Some(file_name) = StdPath::new(file).file_name() else { + return Ok(None); + }; + match file_name.to_string_lossy().as_ref() { + SHUFFLE_DATA_FILE_NAME => data_file = Some(SHUFFLE_DATA_FILE_NAME), + SHUFFLE_OFFSETS_FILE_NAME => offsets_file = Some(SHUFFLE_OFFSETS_FILE_NAME), + _ => return Ok(None), + } + } + let (Some(data_file), Some(offsets_file)) = (data_file, offsets_file) else { + return Ok(None); + }; + + Ok(Some( + TwoFileShuffleReader::try_open_existing( + Arc::new(ObjectStore::local()), + root.clone(), + data_file, + offsets_file, + ) + .await? 
+ .into(), + )) + } + #[allow(clippy::too_many_arguments)] pub fn new( dataset: Dataset, @@ -528,13 +569,30 @@ impl IvfIndexBuilder .as_ref() .and_then(|p| p.precomputed_shuffle_buffers.as_ref()) { - Some((uri, _)) => { + Some((uri, files)) => { + if let Some(reader) = self + .try_open_precomputed_v3_shuffle_reader(uri, files) + .await? + { + log::info!("shuffle with precomputed V3 shuffle files from {}", uri); + self.shuffle_reader = Some(reader); + return Ok(()); + } + let uri = to_local_path(uri); - // the uri points to data directory, - // so need to trim the "data" suffix for reading the dataset - let uri = uri.trim_end_matches("data"); + let uri = if StdPath::new(&uri) + .file_name() + .is_some_and(|name| name == "data") + { + StdPath::new(&uri) + .parent() + .map(|path| path.to_string_lossy().to_string()) + .unwrap_or(uri) + } else { + uri + }; log::info!("shuffle with precomputed shuffle buffers from {}", uri); - let ds = Dataset::open(uri).await?; + let ds = Dataset::open(&uri).await?; ds.scan().try_into_stream().await? 
} _ => { From ec99cda4bafe9be41a38aff747107a6cb9ba2e7e Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 2 Apr 2026 22:17:50 +0800 Subject: [PATCH 09/21] python: fix cuvs pq_dim semantics --- python/python/lance/cuvs.py | 2 +- python/python/tests/test_vector_index.py | 37 ++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index ed8cad83907..a3ad062d8fa 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -349,7 +349,7 @@ def _train_ivf_pq_index_on_cuvs( matrix.shape[0], num_partitions, sample_rate ), pq_bits=num_bits, - pq_dim=num_sub_vectors, + pq_dim=dimension // num_sub_vectors, codebook_kind="subspace", force_random_rotation=False, add_data_on_build=False, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index cf0bfe6e2bf..3c7d92290b7 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -743,6 +743,43 @@ def build(build_params, matrix): assert pq_codebook.shape == (4, 256, 4) +def test_train_ivf_pq_on_cuvs_uses_subvector_dimension_for_pq_dim( + tmp_path, monkeypatch +): + dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) + calls = {} + + class FakeIndex: + centers = np.random.randn(4, 16).astype(np.float32) + pq_centers = np.random.randn(2, 256, 8).astype(np.float32) + + class FakeIvfPqModule: + class IndexParams: + def __init__(self, **kwargs): + calls.update(kwargs) + + @staticmethod + def build(build_params, matrix): + assert matrix.shape[1] == 16 + return FakeIndex() + + monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) + + centroids, pq_codebook = lance_cuvs.train_ivf_pq_on_cuvs( + dataset, + "vector", + 4, + "l2", + "cuvs", + 2, + sample_rate=4, + ) + + assert calls["pq_dim"] == 8 + assert centroids.shape == (4, 16) + assert pq_codebook.shape == (2, 256, 8) + + def 
test_cuvs_as_numpy_prefers_copy_to_host(): class FakeDeviceTensor: def copy_to_host(self): From 1991638af63f13b8ae45e8d1d9d0373af2d5f767 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 3 Apr 2026 02:27:12 +0800 Subject: [PATCH 10/21] python: fix cuvs pq_dim semantics --- AGENTS.md | 6 ++++ python/python/lance/cuvs.py | 12 ++++++- python/python/tests/test_vector_index.py | 46 ++++++++++++++++++++++-- 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 8543d23521a..ec2b3e21773 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -53,6 +53,12 @@ cd test_data && docker compose up -d AWS_DEFAULT_REGION=us-east-1 pytest --run-integration python/tests/test_s3_ddb.py ``` +### Benchmarking Discipline + +- Benchmark machines must use release builds only. For Python bindings, always run `maturin develop --release` before collecting any timing data. +- Never use `maturin develop` without `--release` on a benchmark host. If a dev-profile rebuild is needed for functional debugging, use a different machine or clearly discard all performance results collected afterwards. +- Before trusting a benchmark result, verify the mounted benchmark volume and the active build profile. 
+ ## Coding Standards ### General diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index a3ad062d8fa..07ae8c01a6b 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -155,6 +155,7 @@ def _normalize_pq_codebook( pq_centers = _as_numpy(index.pq_centers) expected_shapes = { + (subvector_dim, num_sub_vectors, pq_book_size): (1, 2, 0), (num_sub_vectors, subvector_dim, pq_book_size): (0, 2, 1), (num_sub_vectors, pq_book_size, subvector_dim): None, } @@ -349,7 +350,7 @@ def _train_ivf_pq_index_on_cuvs( matrix.shape[0], num_partitions, sample_rate ), pq_bits=num_bits, - pq_dim=dimension // num_sub_vectors, + pq_dim=num_sub_vectors, codebook_kind="subspace", force_random_rotation=False, add_data_on_build=False, @@ -391,6 +392,15 @@ def one_pass_assign_ivf_pq_on_cuvs( "one_pass_assign_ivf_pq_on_cuvs requires a trained cuVS index for " "single-node transform" ) + transform_code_width = (trained_index.pq_dim * trained_index.pq_bits + 7) // 8 + if transform_code_width != num_sub_vectors: + raise ValueError( + "cuVS transform output is incompatible with Lance IVF_PQ for this " + "configuration: expected " + f"{num_sub_vectors} PQ code columns, but cuVS will produce " + f"{transform_code_width}. Use a configuration where " + "ceil(pq_dim * pq_bits / 8) == num_sub_vectors." 
+ ) progress = _make_progress(num_rows) progress.set_description("Assigning partitions and computing pq codes") diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 3c7d92290b7..4448c7b57f5 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -743,7 +743,7 @@ def build(build_params, matrix): assert pq_codebook.shape == (4, 256, 4) -def test_train_ivf_pq_on_cuvs_uses_subvector_dimension_for_pq_dim( +def test_train_ivf_pq_on_cuvs_uses_num_sub_vectors_for_pq_dim( tmp_path, monkeypatch ): dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) @@ -775,11 +775,22 @@ def build(build_params, matrix): sample_rate=4, ) - assert calls["pq_dim"] == 8 + assert calls["pq_dim"] == 2 assert centroids.shape == (4, 16) assert pq_codebook.shape == (2, 256, 8) +def test_normalize_pq_codebook_accepts_subvector_dim_first_layout(): + class FakeIndex: + pq_centers = np.random.randn(8, 16, 256).astype(np.float32) + + pq_codebook = lance_cuvs._normalize_pq_codebook( + FakeIndex(), num_sub_vectors=16, num_bits=8, dimension=128 + ) + + assert pq_codebook.shape == (16, 256, 8) + + def test_cuvs_as_numpy_prefers_copy_to_host(): class FakeDeviceTensor: def copy_to_host(self): @@ -855,6 +866,37 @@ def transform(index, vectors): assert offsets_batch.column("offset").type == pa.uint64() +def test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( + tmp_path, + monkeypatch, +): + tbl = create_table(nvec=32, ndim=128) + dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_incompatible") + + ivf_centroids = np.random.randn(4, 128).astype(np.float32) + pq_codebook = np.random.randn(16, 256, 8).astype(np.float32) + monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: object()) + + class FakeIndex: + pq_dim = 8 + pq_bits = 8 + + with pytest.raises( + ValueError, + match="cuVS transform output is incompatible with Lance IVF_PQ", + ): + 
lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( + dataset, + "vector", + "l2", + "cuvs", + ivf_centroids, + pq_codebook, + trained_index=FakeIndex(), + batch_size=8, + ) + + def test_use_index(dataset, tmp_path): ann_ds = lance.write_dataset(dataset.to_table(), tmp_path / "indexed.lance") ann_ds = ann_ds.create_index( From 54c29d863f3ecf1a1df593f61f2afad985d237aa Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 3 Apr 2026 02:33:46 +0800 Subject: [PATCH 11/21] python: revert cuvs shuffle dataset integration --- python/python/lance/cuvs.py | 160 +++++------------------ python/python/tests/test_vector_index.py | 15 +-- 2 files changed, 40 insertions(+), 135 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 07ae8c01a6b..0ab9fcd6d24 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -3,8 +3,6 @@ from __future__ import annotations -import json -import os import re import tempfile from importlib import import_module @@ -14,7 +12,6 @@ import pyarrow.compute as pc from .dependencies import numpy as np -from .file import LanceFileWriter from .log import LOGGER from .util import _normalize_metric_type @@ -195,114 +192,6 @@ def _sample_training_table( return dataset.to_table(columns=[column], filter=filt, limit=train_rows) -def _make_shuffle_batch( - row_ids: np.ndarray, - partitions: np.ndarray, - pq_codes: np.ndarray, - num_partitions: int, - num_sub_vectors: int, -) -> tuple[pa.RecordBatch, pa.RecordBatch]: - sort_indices = np.argsort(partitions, kind="stable") - row_ids = row_ids[sort_indices] - partitions = partitions[sort_indices] - pq_codes = pq_codes[sort_indices] - - pq_values = pa.array(pq_codes.reshape(-1), type=pa.uint8()) - pq_code_array = pa.FixedSizeListArray.from_arrays(pq_values, num_sub_vectors) - partition_counts = np.bincount(partitions, minlength=num_partitions).astype( - np.uint64, copy=False - ) - offsets = np.cumsum(partition_counts, dtype=np.uint64) - data_batch = pa.RecordBatch.from_arrays( 
- [ - pa.array(row_ids, type=pa.uint64()), - pq_code_array, - ], - schema=pa.schema( - [ - pa.field("_rowid", pa.uint64()), - pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), - ] - ), - ) - offsets_batch = pa.RecordBatch.from_arrays( - [pa.array(offsets, type=pa.uint64())], - schema=pa.schema([pa.field("offset", pa.uint64())]), - ) - return data_batch, offsets_batch - - -def _shuffle_metadata( - num_partitions: int, num_batches: int, partition_counts -) -> dict[str, str]: - return { - "lance:shuffle:num_partitions": str(num_partitions), - "lance:shuffle:num_batches": str(num_batches), - "lance:shuffle:partition_counts": json.dumps(list(partition_counts)), - "lance:shuffle:total_loss": "0.0", - } - - -def _write_v3_shuffle_files( - output_root: str, - batches: Iterator[tuple[pa.RecordBatch, pa.RecordBatch]], - *, - num_partitions: int, - num_sub_vectors: int, -) -> list[str]: - os.makedirs(output_root, exist_ok=True) - data_path = os.path.join(output_root, "shuffle_data.lance") - offsets_path = os.path.join(output_root, "shuffle_offsets.lance") - - data_schema = pa.schema( - [ - pa.field("_rowid", pa.uint64()), - pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), - ] - ) - offsets_schema = pa.schema([pa.field("offset", pa.uint64())]) - - data_writer = None - offsets_writer = LanceFileWriter(offsets_path, offsets_schema) - total_partition_counts = np.zeros(num_partitions, dtype=np.uint64) - global_row_count = np.uint64(0) - num_batches = 0 - - for data_batch, offsets_batch in batches: - if data_writer is None: - data_writer = LanceFileWriter(data_path, data_batch.schema) - data_writer.write_batch(data_batch) - - offsets = offsets_batch.column(0).to_numpy() - adjusted_offsets = offsets + global_row_count - offsets_writer.write_batch( - pa.RecordBatch.from_arrays( - [pa.array(adjusted_offsets, type=pa.uint64())], - schema=offsets_schema, - ) - ) - last_offset = np.uint64(0) - for idx, offset in enumerate(offsets): - 
total_partition_counts[idx] += np.uint64(offset) - last_offset - last_offset = np.uint64(offset) - global_row_count += np.uint64(data_batch.num_rows) - num_batches += 1 - - if data_writer is None: - data_writer = LanceFileWriter(data_path, data_schema) - - metadata = _shuffle_metadata( - num_partitions, num_batches, total_partition_counts.tolist() - ) - for key, value in metadata.items(): - data_writer.add_schema_metadata(key, value) - offsets_writer.add_schema_metadata(key, value) - - data_writer.close() - offsets_writer.close() - return ["shuffle_data.lance", "shuffle_offsets.lance"] - - def _train_ivf_pq_index_on_cuvs( dataset, column: str, @@ -375,6 +264,8 @@ def one_pass_assign_ivf_pq_on_cuvs( *, filter_nan: bool = True, ): + from . import write_dataset + if accelerator != "cuvs": raise ValueError("cuVS acceleration only supports accelerator='cuvs'") @@ -405,9 +296,15 @@ def one_pass_assign_ivf_pq_on_cuvs( progress = _make_progress(num_rows) progress.set_description("Assigning partitions and computing pq codes") - def _partition_and_pq_codes_assignment() -> Iterator[ - tuple[pa.RecordBatch, pa.RecordBatch] - ]: + output_schema = pa.schema( + [ + pa.field("row_id", pa.uint64()), + pa.field("__ivf_part_id", pa.uint32()), + pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), + ] + ) + + def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: for batch in dataset.to_batches( columns=[column], filter=filt, @@ -437,26 +334,37 @@ def _partition_and_pq_codes_assignment() -> Iterator[ f"expected {(len(row_ids), num_sub_vectors)}, got {pq_codes.shape}" ) - partition_batch = _make_shuffle_batch( - row_ids, - partitions, - pq_codes, - ivf_centroids.shape[0], - num_sub_vectors, + pq_values = pa.array(pq_codes.reshape(-1), type=pa.uint8()) + pq_code_array = pa.FixedSizeListArray.from_arrays( + pq_values, num_sub_vectors + ) + yield pa.RecordBatch.from_arrays( + [ + pa.array(row_ids, type=pa.uint64()), + pa.array(partitions, 
type=pa.uint32()), + pq_code_array, + ], + schema=output_schema, ) progress.update(len(row_ids)) - yield partition_batch if dst_dataset_uri is None: dst_dataset_uri = tempfile.mkdtemp() if re.search(r".:\\", dst_dataset_uri) is not None: dst_dataset_uri = dst_dataset_uri.replace("\\", "/", 1) - shuffle_buffers = _write_v3_shuffle_files( - str(dst_dataset_uri), - _partition_and_pq_codes_assignment(), - num_partitions=ivf_centroids.shape[0], - num_sub_vectors=num_sub_vectors, + + reader = pa.RecordBatchReader.from_batches( + output_schema, _partition_and_pq_codes_assignment() + ) + ds = write_dataset( + reader, + dst_dataset_uri, + schema=output_schema, + data_storage_version="2.2", ) + shuffle_buffers = [ + data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() + ] progress.close() LOGGER.info("Saved precomputed pq_codes to %s", dst_dataset_uri) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 4448c7b57f5..7ba5feefda6 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -853,17 +853,14 @@ def transform(index, vectors): batch_size=8, ) - from lance.file import LanceFileReader + shuffle_ds = lance.dataset(shuffle_uri) + data_batch = next(shuffle_ds.to_batches(batch_size=1024)) - data_reader = LanceFileReader(str(Path(shuffle_uri) / "shuffle_data.lance")) - offsets_reader = LanceFileReader(str(Path(shuffle_uri) / "shuffle_offsets.lance")) - data_batch = next(data_reader.read_all(batch_size=1024).to_batches()) - offsets_batch = next(offsets_reader.read_all(batch_size=1024).to_batches()) - - assert shuffle_buffers == ["shuffle_data.lance", "shuffle_offsets.lance"] - assert data_batch.column("_rowid").type == pa.uint64() + assert len(shuffle_buffers) > 0 + assert all(path.endswith(".lance") for path in shuffle_buffers) + assert data_batch.column("row_id").type == pa.uint64() + assert data_batch.column("__ivf_part_id").type == pa.uint32() 
assert data_batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) - assert offsets_batch.column("offset").type == pa.uint64() def test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( From 5ecefc4572b27d111bdb06e582f91742bd000a24 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 3 Apr 2026 12:31:32 +0800 Subject: [PATCH 12/21] Support precomputed encoded datasets for IVF_PQ build --- python/python/lance/cuvs.py | 56 +++ python/python/lance/dataset.py | 15 +- python/python/tests/test_vector_index.py | 19 +- python/src/dataset.rs | 13 +- rust/lance-index/src/vector/ivf/builder.rs | 10 +- rust/lance-index/src/vector/ivf/shuffler.rs | 10 +- rust/lance/src/index/vector.rs | 2 + rust/lance/src/index/vector/builder.rs | 101 ++++- .../lance/src/index/vector/encoded_dataset.rs | 370 ++++++++++++++++++ rust/lance/src/index/vector/ivf.rs | 30 +- rust/lance/src/index/vector/ivf/builder.rs | 5 +- 11 files changed, 600 insertions(+), 31 deletions(-) create mode 100644 rust/lance/src/index/vector/encoded_dataset.rs diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 0ab9fcd6d24..ecd3173c249 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -3,6 +3,7 @@ from __future__ import annotations +import json import re import tempfile from importlib import import_module @@ -18,6 +19,16 @@ if TYPE_CHECKING: from pathlib import Path +PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY = ( + "lance:index_build:precomputed_encoded_partition_sizes" +) +PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY = ( + "lance:index_build:precomputed_encoded_partition_fragment_ids" +) +PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY = ( + "lance:index_build:precomputed_encoded_total_loss" +) + def is_cuvs_accelerator(accelerator: object) -> bool: return accelerator == "cuvs" @@ -100,6 +111,45 @@ def _column_to_numpy(table: pa.Table | pa.RecordBatch, column: str) -> np.ndarra return _coerce_float_matrix(np.asarray(values), 
column=column) +def _annotate_precomputed_encoded_dataset( + dataset, + partition_sizes: list[int], + *, + total_loss: float | None = None, +) -> None: + partition_fragments = [[] for _ in range(len(partition_sizes))] + for fragment in dataset.get_fragments(): + fragment_partitions = set() + scanner = ( + dataset.scanner(columns=["__ivf_part_id"]) + .with_fragments([fragment]) + .to_scanner() + ) + for batch in scanner.to_batches(): + fragment_partitions.update( + int(partition_id) + for partition_id in np.unique( + batch.column("__ivf_part_id").to_numpy(zero_copy_only=False) + ) + ) + for partition_id in fragment_partitions: + partition_fragments[partition_id].append(int(fragment.metadata.id)) + + metadata = { + PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY: json.dumps( + [int(size) for size in partition_sizes] + ), + PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY: json.dumps( + partition_fragments + ), + } + if total_loss is not None: + metadata[PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY] = json.dumps( + float(total_loss) + ) + dataset.update_metadata(metadata) + + def _as_numpy(array_like) -> np.ndarray: if isinstance(array_like, np.ndarray): return array_like @@ -295,6 +345,8 @@ def one_pass_assign_ivf_pq_on_cuvs( progress = _make_progress(num_rows) progress.set_description("Assigning partitions and computing pq codes") + num_partitions = ivf_centroids.shape[0] + partition_sizes = np.zeros(num_partitions, dtype=np.int64) output_schema = pa.schema( [ @@ -327,6 +379,7 @@ def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: trained_index, _to_cuvs_transform_input(vectors) ) partitions = _as_numpy(partitions).astype(np.uint32, copy=False) + partition_sizes[:] += np.bincount(partitions, minlength=num_partitions) pq_codes = _as_numpy(pq_codes).astype(np.uint8, copy=False) if pq_codes.shape != (len(row_ids), num_sub_vectors): raise ValueError( @@ -362,6 +415,9 @@ def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: 
schema=output_schema, data_storage_version="2.2", ) + _annotate_precomputed_encoded_dataset( + ds, partition_sizes.astype(int).tolist() + ) shuffle_buffers = [ data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() ] diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 5dee7767918..be5f7cd2c6f 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2985,10 +2985,7 @@ def _create_index_impl( ) LOGGER.info("cuVS ivf+pq training time: %ss", ivfpq_train_time) timers["ivf+pq_assign:start"] = time.time() - ( - shuffle_output_dir, - shuffle_buffers, - ) = one_pass_assign_ivf_pq_on_cuvs( + shuffle_output_dir, _ = one_pass_assign_ivf_pq_on_cuvs( self, column[0], metric, @@ -3004,8 +3001,7 @@ def _create_index_impl( timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] ) LOGGER.info("cuVS ivf+pq transform time: %ss", ivfpq_assign_time) - kwargs["precomputed_shuffle_buffers"] = shuffle_buffers - kwargs["precomputed_shuffle_buffers_path"] = shuffle_output_dir + kwargs["precomputed_encoded_dataset_uri"] = shuffle_output_dir else: from .vector import ( one_pass_assign_ivf_pq_on_accelerator, @@ -3213,6 +3209,13 @@ def _create_index_impl( "Temporary shuffle buffers stored at %s, you may want to delete it.", kwargs["precomputed_shuffle_buffers_path"], ) + if "precomputed_encoded_dataset_uri" in kwargs.keys() and os.path.exists( + kwargs["precomputed_encoded_dataset_uri"] + ): + LOGGER.info( + "Temporary precomputed encoded dataset stored at %s, you may want to delete it.", + kwargs["precomputed_encoded_dataset_uri"], + ) return index def create_index( diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 7ba5feefda6..11c4fdb7ad0 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright The Lance Authors import logging +import json import os import 
platform import random @@ -612,6 +613,9 @@ def fake_assign( ), shuffle_ds_uri, ) + lance_cuvs._annotate_precomputed_encoded_dataset( + shuffle_ds, [len(row_ids), 0, 0, 0] + ) shuffle_buffers = [ data_file.path for frag in shuffle_ds.get_fragments() @@ -803,7 +807,7 @@ def copy_to_host(self): assert array.dtype == np.float32 -def test_one_pass_assign_ivf_pq_on_cuvs_writes_shuffle_buffers(tmp_path, monkeypatch): +def test_one_pass_assign_ivf_pq_on_cuvs_writes_encoded_dataset(tmp_path, monkeypatch): tbl = create_table(nvec=32, ndim=16) dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") @@ -861,6 +865,19 @@ def transform(index, vectors): assert data_batch.column("row_id").type == pa.uint64() assert data_batch.column("__ivf_part_id").type == pa.uint32() assert data_batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) + metadata = shuffle_ds.metadata() + assert json.loads( + metadata[ + lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY + ] + ) == [8, 8, 8, 8] + partition_fragments = json.loads( + metadata[ + lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY + ] + ) + assert len(partition_fragments) == 4 + assert all(partition_fragments) def test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 35306636c93..62e852c117e 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -19,7 +19,6 @@ use chrono::{Duration, TimeDelta, Utc}; use futures::{StreamExt, TryFutureExt}; use lance_index::vector::bq::RQBuildParams; use log::error; -use object_store::path::Path; use pyo3::exceptions::{PyStopIteration, PyTypeError}; use pyo3::types::{PyBytes, PyInt, PyList, PySet, PyString, PyTuple}; use pyo3::{IntoPyObjectExt, prelude::*}; @@ -3372,6 +3371,10 @@ fn prepare_vector_index_params( ivf_params.precomputed_partitions_file = Some(f.to_string()); }; + if let Some(uri) = kwargs.get_item("precomputed_encoded_dataset_uri")? 
{ + ivf_params.precomputed_encoded_dataset_uri = Some(uri.to_string()); + }; + if let Some(storage_options) = storage_options { ivf_params.storage_options = Some(storage_options); } @@ -3381,18 +3384,12 @@ fn prepare_vector_index_params( kwargs.get_item("precomputed_shuffle_buffers_path")?, ) { (Some(l), Some(p)) => { - let path = Path::parse(p.to_string()).map_err(|e| { - PyValueError::new_err(format!( - "Failed to parse precomputed_shuffle_buffers_path: {}", - e - )) - })?; let list = l .downcast::()? .iter() .map(|f| f.to_string()) .collect(); - ivf_params.precomputed_shuffle_buffers = Some((path, list)); + ivf_params.precomputed_shuffle_buffers = Some((p.to_string(), list)); } (None, None) => {} _ => { diff --git a/rust/lance-index/src/vector/ivf/builder.rs b/rust/lance-index/src/vector/ivf/builder.rs index 72e05555441..9154adbcd80 100644 --- a/rust/lance-index/src/vector/ivf/builder.rs +++ b/rust/lance-index/src/vector/ivf/builder.rs @@ -9,8 +9,6 @@ use std::sync::Arc; use arrow_array::cast::AsArray; use arrow_array::{Array, FixedSizeListArray, UInt32Array, UInt64Array}; use futures::TryStreamExt; -use object_store::path::Path; - use lance_core::error::{Error, Result}; use lance_io::stream::RecordBatchStream; @@ -48,7 +46,12 @@ pub struct IvfBuildParams { /// requires `centroids` to be set /// /// The input is expected to be (/dir/to/buffers, [buffer1.lance, buffer2.lance, ...]) - pub precomputed_shuffle_buffers: Option<(Path, Vec)>, + pub precomputed_shuffle_buffers: Option<(String, Vec)>, + + /// Precomputed encoded dataset (_rowid/row_id -> partition_id, pq_code). + /// Mutually exclusive with `precomputed_partitions_file` and `precomputed_shuffle_buffers`. + /// Requires `centroids` to be set. 
+ pub precomputed_encoded_dataset_uri: Option, pub shuffle_partition_batches: usize, @@ -69,6 +72,7 @@ impl Default for IvfBuildParams { sample_rate: 256, // See faiss precomputed_partitions_file: None, precomputed_shuffle_buffers: None, + precomputed_encoded_dataset_uri: None, shuffle_partition_batches: 1024 * 10, shuffle_partition_concurrency: 2, storage_options: None, diff --git a/rust/lance-index/src/vector/ivf/shuffler.rs b/rust/lance-index/src/vector/ivf/shuffler.rs index f4e03c8f036..f78be7b0be2 100644 --- a/rust/lance-index/src/vector/ivf/shuffler.rs +++ b/rust/lance-index/src/vector/ivf/shuffler.rs @@ -246,12 +246,18 @@ pub async fn shuffle_dataset( num_partitions: u32, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(Path, Vec)>, + precomputed_shuffle_buffers: Option<(String, Vec)>, ) -> Result>>> { // step 1: either use precomputed shuffle files or write shuffle data to a file let shuffler = if let Some((path, buffers)) = precomputed_shuffle_buffers { info!("Precomputed shuffle files provided, skip calculation of IVF partition."); - let mut shuffler = IvfShuffler::try_new(num_partitions, Some(path), true, None)?; + if path.contains("://") { + return Err(Error::not_supported( + "legacy IVF shuffler does not support remote precomputed_shuffle_buffers; use the V3 vector index builder path instead".to_string(), + )); + } + let mut shuffler = + IvfShuffler::try_new(num_partitions, Some(Path::parse(&path)?), true, None)?; unsafe { shuffler.set_unsorted_buffers(&buffers); } diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index c5c9038403e..13176c3bca8 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use std::{any::Any, collections::HashMap}; pub mod builder; +mod encoded_dataset; pub mod ivf; pub mod pq; pub mod utils; @@ -1655,6 +1656,7 @@ fn derive_ivf_params(ivf_model: &IvfModel) -> IvfBuildParams { 
sample_rate: 256, // Default precomputed_partitions_file: None, precomputed_shuffle_buffers: None, + precomputed_encoded_dataset_uri: None, shuffle_partition_batches: 1024 * 10, // Default shuffle_partition_concurrency: 2, // Default storage_options: None, diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 9a7001834d0..258e978f1ac 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -71,8 +71,11 @@ use lance_index::{ MIN_PARTITION_SIZE_PERCENT, }; use lance_io::local::to_local_path; +use lance_io::object_store::{ + ObjectStore, ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor, +}; use lance_io::stream::RecordBatchStream; -use lance_io::{object_store::ObjectStore, stream::RecordBatchStreamAdapter}; +use lance_io::stream::RecordBatchStreamAdapter; use lance_linalg::distance::{DistanceType, Dot, L2, Normalize}; use lance_linalg::kernels::normalize_fsl; use log::info; @@ -82,12 +85,14 @@ use tracing::{Level, instrument, span}; use crate::Dataset; use crate::dataset::ProjectionRequest; +use crate::dataset::builder::DatasetBuilder; use crate::dataset::index::dataset_format_version; use crate::index::vector::ivf::v2::PartitionEntry; use crate::index::vector::utils::{infer_vector_dim, infer_vector_element_type}; use super::v2::IVFIndex; use super::{ + encoded_dataset::EncodedDatasetShuffleReader, ivf::load_precomputed_partitions_if_available, utils::{self, get_vector_type}, }; @@ -145,9 +150,41 @@ type BuildStream = Pin::Storage, S, f64)>>> + Send>>; impl IvfIndexBuilder { + fn precomputed_shuffle_buffers_uri(root: &str) -> String { + let uri = root.to_string(); + if uri.contains("://") { + uri + } else { + to_local_path(&Path::from(root)) + } + } + + fn precomputed_shuffle_buffers_root_uri(root: &str) -> String { + let uri = Self::precomputed_shuffle_buffers_uri(root); + if uri.ends_with("/data") { + uri.trim_end_matches("/data").to_string() + } else { + uri + } + 
} + + fn object_store_params(&self) -> ObjectStoreParams { + let mut params = ObjectStoreParams::default(); + if let Some(storage_options) = self + .ivf_params + .as_ref() + .and_then(|params| params.storage_options.clone()) + { + params.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(storage_options), + )); + } + params + } + async fn try_open_precomputed_v3_shuffle_reader( &self, - root: &Path, + root: &str, files: &[String], ) -> Result>> { if files.len() != 2 { @@ -169,11 +206,19 @@ impl IvfIndexBuilder let (Some(data_file), Some(offsets_file)) = (data_file, offsets_file) else { return Ok(None); }; + let registry = Arc::new(ObjectStoreRegistry::default()); + let params = self.object_store_params(); + let (object_store, output_dir) = ObjectStore::from_uri_and_params( + registry, + &Self::precomputed_shuffle_buffers_root_uri(root), + ¶ms, + ) + .await?; Ok(Some( TwoFileShuffleReader::try_open_existing( - Arc::new(ObjectStore::local()), - root.clone(), + object_store, + output_dir, data_file, offsets_file, ) @@ -182,6 +227,19 @@ impl IvfIndexBuilder )) } + async fn try_open_precomputed_encoded_dataset_reader( + &self, + uri: &str, + ) -> Result> { + let storage_options = self + .ivf_params + .as_ref() + .and_then(|params| params.storage_options.as_ref()); + Ok(Arc::new( + EncodedDatasetShuffleReader::try_open(uri, storage_options).await?, + )) + } + #[allow(clippy::too_many_arguments)] pub fn new( dataset: Dataset, @@ -564,6 +622,19 @@ impl IvfIndexBuilder return Err(Error::invalid_input("dataset not set before shuffling")); }; + if let Some(uri) = self + .ivf_params + .as_ref() + .and_then(|params| params.precomputed_encoded_dataset_uri.as_deref()) + { + log::info!("shuffle with precomputed encoded dataset from {}", uri); + self.shuffle_reader = Some( + self.try_open_precomputed_encoded_dataset_reader(uri) + .await?, + ); + return Ok(()); + } + let stream = match self .ivf_params .as_ref() @@ -579,7 +650,7 @@ impl 
IvfIndexBuilder return Ok(()); } - let uri = to_local_path(uri); + let uri = Self::precomputed_shuffle_buffers_root_uri(uri); let uri = if StdPath::new(&uri) .file_name() .is_some_and(|name| name == "data") @@ -592,7 +663,15 @@ impl IvfIndexBuilder uri }; log::info!("shuffle with precomputed shuffle buffers from {}", uri); - let ds = Dataset::open(&uri).await?; + let mut builder = DatasetBuilder::from_uri(&uri); + if let Some(storage_options) = self + .ivf_params + .as_ref() + .and_then(|params| params.storage_options.clone()) + { + builder = builder.with_storage_options(storage_options); + } + let ds = builder.load().await?; ds.scan().try_into_stream().await? } _ => { @@ -2296,4 +2375,14 @@ mod tests { let row_ids = batches[0][ROW_ID].as_primitive::(); assert_eq!(row_ids.values(), &[4, 3, 2, 1, 0]); } + + #[test] + fn precomputed_shuffle_buffer_uri_preserves_remote_uri() { + assert_eq!( + IvfIndexBuilder::::precomputed_shuffle_buffers_root_uri( + "s3://bucket/shuffle" + ), + "s3://bucket/shuffle" + ); + } } diff --git a/rust/lance/src/index/vector/encoded_dataset.rs b/rust/lance/src/index/vector/encoded_dataset.rs new file mode 100644 index 00000000000..866f903805c --- /dev/null +++ b/rust/lance/src/index/vector/encoded_dataset.rs @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_schema::Fields; +use futures::StreamExt; +use lance_core::utils::tokio::get_num_compute_intensive_cpus; +use lance_core::{Error, ROW_ID, Result}; +use lance_index::vector::v3::shuffler::ShuffleReader; +use lance_index::vector::{PART_ID_COLUMN, PQ_CODE_COLUMN}; +use lance_io::stream::{RecordBatchStream, RecordBatchStreamAdapter}; +use lance_table::format::Fragment; +use log::warn; +use serde::de::DeserializeOwned; + +use crate::Dataset; +use crate::dataset::builder::DatasetBuilder; + +pub(crate) const PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY: &str = + 
"lance:index_build:precomputed_encoded_partition_sizes"; +pub(crate) const PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY: &str = + "lance:index_build:precomputed_encoded_partition_fragment_ids"; +pub(crate) const PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY: &str = + "lance:index_build:precomputed_encoded_total_loss"; + +const PRECOMPUTED_ROW_ID_COLUMN: &str = "row_id"; + +pub(crate) struct EncodedDatasetShuffleReader { + dataset: Dataset, + row_id_column: String, + partition_sizes: Vec, + partition_fragments: Option>>, + total_loss: Option, +} + +impl EncodedDatasetShuffleReader { + pub(crate) async fn try_open( + uri: &str, + storage_options: Option<&HashMap>, + ) -> Result { + let mut builder = DatasetBuilder::from_uri(uri); + if let Some(storage_options) = storage_options { + builder = builder.with_storage_options(storage_options.clone()); + } + let dataset = builder.load().await?; + Self::try_new(dataset) + } + + pub(crate) fn try_new(dataset: Dataset) -> Result { + let row_id_column = if dataset.schema().field(ROW_ID).is_some() { + ROW_ID.to_string() + } else if dataset.schema().field(PRECOMPUTED_ROW_ID_COLUMN).is_some() { + PRECOMPUTED_ROW_ID_COLUMN.to_string() + } else { + return Err(Error::invalid_input(format!( + "precomputed encoded dataset must contain '{}' or '{}' column", + ROW_ID, PRECOMPUTED_ROW_ID_COLUMN + ))); + }; + + for required_column in [PART_ID_COLUMN, PQ_CODE_COLUMN] { + if dataset.schema().field(required_column).is_none() { + return Err(Error::invalid_input(format!( + "precomputed encoded dataset is missing required column '{}'", + required_column + ))); + } + } + + let metadata = dataset.metadata(); + let partition_sizes: Vec = + parse_required_metadata(metadata, PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY)?; + + let partition_fragments = parse_optional_metadata::>>( + metadata, + PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY, + )? 
+ .map(|partition_fragment_ids| resolve_partition_fragments(&dataset, partition_fragment_ids)) + .transpose()?; + + if let Some(partition_fragments) = partition_fragments.as_ref() { + if partition_fragments.len() != partition_sizes.len() { + return Err(Error::invalid_input(format!( + "metadata '{}' has {} partitions but '{}' has {}", + PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY, + partition_fragments.len(), + PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY, + partition_sizes.len(), + ))); + } + } + + let total_loss = + parse_optional_metadata::(metadata, PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY)?; + + Ok(Self { + dataset, + row_id_column, + partition_sizes, + partition_fragments, + total_loss, + }) + } + + fn rename_row_id( + stream: impl RecordBatchStream + Unpin + 'static, + row_id_idx: usize, + ) -> impl RecordBatchStream + Unpin + 'static { + let new_schema = Arc::new(arrow_schema::Schema::new( + stream + .schema() + .fields + .iter() + .enumerate() + .map(|(field_idx, field)| { + if field_idx == row_id_idx { + arrow_schema::Field::new( + ROW_ID, + field.data_type().clone(), + field.is_nullable(), + ) + } else { + field.as_ref().clone() + } + }) + .collect::(), + )); + RecordBatchStreamAdapter::new( + new_schema.clone(), + stream.map(move |batch| match batch { + Ok(batch) => { + arrow_array::RecordBatch::try_new(new_schema.clone(), batch.columns().to_vec()) + .map_err(Error::from) + } + Err(error) => Err(error), + }), + ) + } +} + +#[async_trait::async_trait] +impl ShuffleReader for EncodedDatasetShuffleReader { + async fn read_partition( + &self, + partition_id: usize, + ) -> Result>> { + if partition_id >= self.partition_sizes.len() { + return Ok(None); + } + if self.partition_sizes[partition_id] == 0 { + return Ok(None); + } + + let mut scanner = self.dataset.scan(); + scanner.batch_readahead(get_num_compute_intensive_cpus()); + scanner.project(&[self.row_id_column.as_str(), PART_ID_COLUMN, PQ_CODE_COLUMN])?; + + if let 
Some(partition_fragments) = self.partition_fragments.as_ref() { + let fragments = &partition_fragments[partition_id]; + if fragments.is_empty() { + warn!( + "precomputed encoded dataset metadata has no fragments for non-empty partition {}, falling back to filtered scan", + partition_id + ); + } else { + scanner.with_fragments(fragments.clone()); + } + } + + scanner.filter(&format!("{PART_ID_COLUMN} = {partition_id}"))?; + let stream = scanner.try_into_stream().await?; + if let Some((row_id_idx, _)) = stream.schema().column_with_name(PRECOMPUTED_ROW_ID_COLUMN) { + Ok(Some(Box::new(Self::rename_row_id(stream, row_id_idx)))) + } else { + Ok(Some(Box::new(stream))) + } + } + + fn partition_size(&self, partition_id: usize) -> Result { + Ok(self.partition_sizes.get(partition_id).copied().unwrap_or(0)) + } + + fn total_loss(&self) -> Option { + self.total_loss + } +} + +fn parse_required_metadata( + metadata: &HashMap, + key: &str, +) -> Result { + let value = metadata.get(key).ok_or_else(|| { + Error::invalid_input(format!( + "precomputed encoded dataset is missing required metadata '{}'", + key + )) + })?; + parse_metadata_value(value, key) +} + +fn parse_optional_metadata( + metadata: &HashMap, + key: &str, +) -> Result> { + metadata + .get(key) + .map(|value| parse_metadata_value(value, key)) + .transpose() +} + +fn parse_metadata_value(value: &str, key: &str) -> Result { + serde_json::from_str(value).map_err(|error| { + Error::invalid_input(format!( + "failed to parse precomputed encoded dataset metadata '{}' from '{}': {}", + key, value, error + )) + }) +} + +fn resolve_partition_fragments( + dataset: &Dataset, + partition_fragment_ids: Vec>, +) -> Result>> { + let fragments_by_id = dataset + .fragments() + .iter() + .cloned() + .map(|fragment| (fragment.id, fragment)) + .collect::>(); + + partition_fragment_ids + .into_iter() + .map(|fragment_ids| { + fragment_ids + .into_iter() + .map(|fragment_id| { + fragments_by_id.get(&fragment_id).cloned().ok_or_else(|| { + 
Error::invalid_input(format!( + "precomputed encoded dataset metadata references unknown fragment id {}", + fragment_id + )) + }) + }) + .collect() + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + use arrow_array::{ + FixedSizeListArray, RecordBatch, RecordBatchIterator, UInt8Array, UInt32Array, UInt64Array, + cast::AsArray, + }; + use futures::TryStreamExt; + use lance_arrow::FixedSizeListArrayExt; + + use crate::dataset::WriteParams; + + #[tokio::test] + async fn encoded_dataset_reader_reads_mapped_fragments_and_renames_row_id() { + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new("row_id", arrow_schema::DataType::UInt64, false), + arrow_schema::Field::new(PART_ID_COLUMN, arrow_schema::DataType::UInt32, false), + arrow_schema::Field::new( + PQ_CODE_COLUMN, + arrow_schema::DataType::FixedSizeList( + Arc::new(arrow_schema::Field::new( + "item", + arrow_schema::DataType::UInt8, + true, + )), + 2, + ), + true, + ), + ])); + + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![10_u64, 11])), + Arc::new(UInt32Array::from(vec![0_u32, 1])), + Arc::new( + FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![1, 2, 3, 4]), 2) + .unwrap(), + ), + ], + ) + .unwrap(); + let batch2 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![12_u64, 13])), + Arc::new(UInt32Array::from(vec![1_u32, 1])), + Arc::new( + FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![5, 6, 7, 8]), 2) + .unwrap(), + ), + ], + ) + .unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch1), Ok(batch2)], schema); + let write_params = WriteParams { + max_rows_per_file: 2, + max_rows_per_group: 2, + ..Default::default() + }; + let mut dataset = Dataset::write( + reader, + "memory://precomputed-encoded-reader", + Some(write_params), + ) + .await + .unwrap(); + + let fragment_ids = dataset + .get_fragments() + .into_iter() + .map(|fragment| 
fragment.metadata().id) + .collect::>(); + assert_eq!(fragment_ids.len(), 2); + + dataset + .update_metadata(vec![ + ( + PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY.to_string(), + serde_json::to_string(&vec![1_usize, 3]).unwrap(), + ), + ( + PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY.to_string(), + serde_json::to_string(&vec![ + vec![fragment_ids[0] as u64], + vec![fragment_ids[0] as u64, fragment_ids[1] as u64], + ]) + .unwrap(), + ), + ]) + .await + .unwrap(); + + let reader = EncodedDatasetShuffleReader::try_new(dataset).unwrap(); + assert_eq!(reader.partition_size(0).unwrap(), 1); + assert_eq!(reader.partition_size(1).unwrap(), 3); + + let stream = reader.read_partition(1).await.unwrap().unwrap(); + let batches = stream.try_collect::>().await.unwrap(); + let row_ids = batches + .iter() + .flat_map(|batch| { + batch[ROW_ID] + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect::>(); + assert_eq!(row_ids, vec![11, 12, 13]); + assert!( + batches + .iter() + .all(|batch| batch.column_by_name("row_id").is_none()) + ); + } +} diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index b67e6ea8e81..1f2b47887db 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -1204,6 +1204,12 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } + if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.centroids.is_none() { + return Err(Error::index( + "precomputed_encoded_dataset_uri requires centroids to be set".to_string(), + )); + } + if ivf.precomputed_shuffle_buffers.is_some() && ivf.precomputed_partitions_file.is_some() { return Err(Error::index( "precomputed_shuffle_buffers and precomputed_partitions_file are mutually exclusive" @@ -1211,6 +1217,20 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } + if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.precomputed_partitions_file.is_some() { + return Err(Error::index( + 
"precomputed_encoded_dataset_uri and precomputed_partitions_file are mutually exclusive" + .to_string(), + )); + } + + if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.precomputed_shuffle_buffers.is_some() { + return Err(Error::index( + "precomputed_encoded_dataset_uri and precomputed_shuffle_buffers are mutually exclusive" + .to_string(), + )); + } + Ok(()) } @@ -1222,6 +1242,12 @@ fn sanity_check_params(ivf: &IvfBuildParams, pq: &PQBuildParams) -> Result<()> { )); } + if ivf.precomputed_encoded_dataset_uri.is_some() && pq.codebook.is_none() { + return Err(Error::index( + "precomputed_encoded_dataset_uri requires codebooks to be set".to_string(), + )); + } + Ok(()) } @@ -1698,7 +1724,7 @@ async fn write_ivf_pq_file( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(Path, Vec)>, + precomputed_shuffle_buffers: Option<(String, Vec)>, ) -> Result<()> { let path = index_dir.child(uuid).child(INDEX_FILE_NAME); let mut writer = object_store.create(&path).await?; @@ -1791,7 +1817,7 @@ async fn write_ivf_hnsw_file( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(Path, Vec)>, + precomputed_shuffle_buffers: Option<(String, Vec)>, ) -> Result<()> { let object_store = dataset.object_store(); let path = dataset.indices_dir().child(uuid).child(INDEX_FILE_NAME); diff --git a/rust/lance/src/index/vector/ivf/builder.rs b/rust/lance/src/index/vector/ivf/builder.rs index 9bd1ba95803..bcd47ae4057 100644 --- a/rust/lance/src/index/vector/ivf/builder.rs +++ b/rust/lance/src/index/vector/ivf/builder.rs @@ -22,7 +22,6 @@ use lance_index::vector::{ivf::storage::IvfModel, transform::Transformer}; use lance_io::stream::RecordBatchStreamAdapter; use lance_table::io::manifest::ManifestDescribing; use log::info; -use object_store::path::Path; use tracing::instrument; use lance_core::{Error, ROW_ID, 
Result, traits::DatasetTakeRows}; @@ -55,7 +54,7 @@ pub(super) async fn build_partitions( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(Path, Vec)>, + precomputed_shuffle_buffers: Option<(String, Vec)>, ) -> Result<()> { let schema = data.schema(); if schema.column_with_name(column).is_none() { @@ -254,7 +253,7 @@ pub(super) async fn build_hnsw_partitions( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(Path, Vec)>, + precomputed_shuffle_buffers: Option<(String, Vec)>, ) -> Result<(Vec, IvfModel)> { let schema = data.schema(); if schema.column_with_name(column).is_none() { From e3f29f56484b1824b44dfe976f2e4db31573a632 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 3 Apr 2026 13:45:56 +0800 Subject: [PATCH 13/21] python: fix fragment scans in cuvs encoded dataset metadata --- python/python/lance/cuvs.py | 6 +---- python/python/tests/test_vector_index.py | 29 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index ecd3173c249..5940edbdea3 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -120,11 +120,7 @@ def _annotate_precomputed_encoded_dataset( partition_fragments = [[] for _ in range(len(partition_sizes))] for fragment in dataset.get_fragments(): fragment_partitions = set() - scanner = ( - dataset.scanner(columns=["__ivf_part_id"]) - .with_fragments([fragment]) - .to_scanner() - ) + scanner = fragment.scanner(columns=["__ivf_part_id"]) for batch in scanner.to_batches(): fragment_partitions.update( int(partition_id) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 11c4fdb7ad0..1d17423bd68 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -807,6 
+807,35 @@ def copy_to_host(self): assert array.dtype == np.float32 +def test_annotate_precomputed_encoded_dataset_scans_fragment_directly(tmp_path): + dataset_uri = tmp_path / "encoded_dataset" + + def make_table(partition_ids: list[int], row_id_start: int): + part_ids = np.asarray(partition_ids, dtype=np.uint32) + row_ids = pa.array( + np.arange(row_id_start, row_id_start + len(partition_ids), dtype=np.uint64) + ) + pq_values = pa.array(np.zeros(len(partition_ids) * 4, dtype=np.uint8)) + pq_codes = pa.FixedSizeListArray.from_arrays(pq_values, 4) + return pa.Table.from_arrays( + [row_ids, pa.array(part_ids), pq_codes], + names=["row_id", "__ivf_part_id", "__pq_code"], + ) + + ds = lance.write_dataset(make_table([0, 1, 1, 0], 0), dataset_uri) + ds = lance.write_dataset(make_table([2, 3, 2, 3], 4), dataset_uri, mode="append") + + lance_cuvs._annotate_precomputed_encoded_dataset(ds, [2, 2, 2, 2]) + + metadata = ds.metadata() + partition_fragments = json.loads( + metadata[ + lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY + ] + ) + assert partition_fragments == [[0], [0], [1], [1]] + + def test_one_pass_assign_ivf_pq_on_cuvs_writes_encoded_dataset(tmp_path, monkeypatch): tbl = create_table(nvec=32, ndim=16) dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") From 769a218c2fb8885f31cca1cbcd02a4e751ccd62e Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 7 Apr 2026 16:21:57 +0800 Subject: [PATCH 14/21] feat: add partition artifacts for cuvs builds --- Cargo.toml | 2 +- python/Cargo.lock | 112 +- python/Cargo.toml | 2 + python/python/lance/cuvs.py | 188 ++- python/python/lance/dataset.py | 11 +- python/python/lance/lance/__init__.pyi | 14 + python/python/tests/test_vector_index.py | 188 ++- python/src/dataset.rs | 4 + python/src/file.rs | 77 + python/src/indices.rs | 91 ++ python/src/lib.rs | 4 +- rust/lance-cuvs/Cargo.toml | 28 + rust/lance-cuvs/src/lib.rs | 1237 +++++++++++++++++ rust/lance-index/src/vector/ivf/builder.rs | 5 + 
rust/lance/src/index/vector.rs | 3 + rust/lance/src/index/vector/builder.rs | 27 + rust/lance/src/index/vector/ivf.rs | 37 + .../src/index/vector/partition_artifact.rs | 956 +++++++++++++ rust/lance/src/index/vector/utils.rs | 2 +- 19 files changed, 2913 insertions(+), 75 deletions(-) create mode 100644 rust/lance-cuvs/Cargo.toml create mode 100644 rust/lance-cuvs/src/lib.rs create mode 100644 rust/lance/src/index/vector/partition_artifact.rs diff --git a/Cargo.toml b/Cargo.toml index c922eff6b8b..bddb49ed4a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ "rust/compression/bitpacking", "rust/arrow-scalar", ] -exclude = ["python", "java/lance-jni"] +exclude = ["python", "java/lance-jni", "rust/lance-cuvs"] # Python package needs to be built by maturin. resolver = "3" diff --git a/python/Cargo.lock b/python/Cargo.lock index 4507a617872..aa4cfb72154 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -14,7 +14,7 @@ dependencies = [ "core_extensions", "crossbeam-channel", "generational-arena", - "libloading", + "libloading 0.7.4", "lock_api", "parking_lot", "paste", @@ -1070,6 +1070,26 @@ dependencies = [ "virtue", ] +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags 2.11.0", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.117", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1269,6 +1289,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = 
[ + "nom 7.1.3", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -1315,6 +1344,17 @@ dependencies = [ "inout", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading 0.8.9", +] + [[package]] name = "cmake" version = "0.1.58" @@ -1577,6 +1617,26 @@ dependencies = [ "memchr", ] +[[package]] +name = "cuvs" +version = "26.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9778fa1e16f42539772496e9adba2a29c67dca84bcb0d247795f9cb3135ba87d" +dependencies = [ + "cuvs-sys", + "ndarray 0.15.6", +] + +[[package]] +name = "cuvs-sys" +version = "26.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4cad121da7a7ac908965352ffeac029a93fb0e3a1278a271f7204098b8724e9" +dependencies = [ + "bindgen", + "cmake", +] + [[package]] name = "darling" version = "0.20.11" @@ -3985,6 +4045,30 @@ dependencies = [ "url", ] +[[package]] +name = "lance-cuvs" +version = "5.0.0-beta.2" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", + "cuvs", + "cuvs-sys", + "futures", + "half", + "lance", + "lance-arrow", + "lance-core", + "lance-file", + "lance-index", + "lance-io", + "lance-linalg", + "log", + "ndarray 0.16.1", + "tokio", +] + [[package]] name = "lance-datafusion" version = "5.0.0-beta.2" @@ -4167,7 +4251,7 @@ dependencies = [ "lindera", "lindera-tantivy", "log", - "ndarray", + "ndarray 0.16.1", "num-traits", "object_store", "prost", @@ -4468,6 +4552,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "liblzma" version = "0.4.6" @@ -4894,6 +4988,19 @@ 
version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -5689,6 +5796,7 @@ dependencies = [ "lance", "lance-arrow", "lance-core", + "lance-cuvs", "lance-datafusion", "lance-datagen", "lance-encoding", diff --git a/python/Cargo.toml b/python/Cargo.toml index a3542f7360f..d63a8e113d4 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -35,6 +35,7 @@ lance = { path = "../rust/lance", features = [ ] } lance-arrow = { path = "../rust/lance-arrow" } lance-core = { path = "../rust/lance-core" } +lance-cuvs = { path = "../rust/lance-cuvs", optional = true } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } @@ -75,6 +76,7 @@ bytes = "1.4" [features] default = [] +cuvs = ["dep:lance-cuvs"] datagen = ["lance-datagen"] fp16kernels = ["lance/fp16kernels"] diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 5940edbdea3..b36fd18c564 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -12,6 +12,8 @@ import pyarrow as pa import pyarrow.compute as pc +from .file import LanceFileSession +from .lance import PartitionArtifactBuilder from .dependencies import numpy as np from .log import LOGGER from .util import _normalize_metric_type @@ -29,6 +31,36 @@ "lance:index_build:precomputed_encoded_total_loss" ) +PARTITION_ARTIFACT_MANIFEST_VERSION = 1 +PARTITION_ARTIFACT_MANIFEST_FILE_NAME = "manifest.json" +PARTITION_ARTIFACT_METADATA_FILE_NAME = "metadata.lance" 
+PARTITION_ARTIFACT_PARTITIONS_DIR = "partitions" +DEFAULT_PARTITION_ARTIFACT_BUCKETS = 256 +PARTITION_ARTIFACT_ROW_ID_COLUMN = "_rowid" + +try: + from . import lance as _lance_ext + + _assign_ivf_pq_on_cuvs_rust_impl = getattr( + _lance_ext.indices, "_assign_ivf_pq_on_cuvs_rust" + ) + _train_ivf_pq_on_cuvs_rust_impl = getattr( + _lance_ext.indices, "_train_ivf_pq_on_cuvs_rust" + ) +except (ImportError, AttributeError): + _assign_ivf_pq_on_cuvs_rust_impl = None + _train_ivf_pq_on_cuvs_rust_impl = None + + +def _has_rust_cuvs_backend() -> bool: + return ( + _train_ivf_pq_on_cuvs_rust_impl is not None + and _assign_ivf_pq_on_cuvs_rust_impl is not None + ) + +def _unwrap_dataset(dataset): + return getattr(dataset, "_ds", dataset) + def is_cuvs_accelerator(accelerator: object) -> bool: return accelerator == "cuvs" @@ -170,6 +202,97 @@ def _as_numpy(array_like) -> np.ndarray: raise TypeError("Unable to convert cuVS output to numpy") +def _normalize_artifact_root(path_or_uri: str | Path) -> str: + root = str(path_or_uri) + if re.search(r".:\\", root) is not None: + root = root.replace("\\", "/", 1) + return root + + +def _make_metadata_table( + ivf_centroids: np.ndarray, + pq_codebook: np.ndarray, +) -> pa.Table: + dimension = ivf_centroids.shape[1] + subvector_dim = pq_codebook.shape[2] + ivf_type = pa.list_(pa.list_(pa.float32(), dimension)) + pq_type = pa.list_(pa.list_(pa.float32(), subvector_dim)) + ivf_values = pa.array([ivf_centroids.tolist()], type=ivf_type) + pq_values = pa.array( + [pq_codebook.reshape(-1, subvector_dim).tolist()], + type=pq_type, + ) + return pa.Table.from_arrays( + [ivf_values, pq_values], + names=["_ivf_centroids", "_pq_codebook"], + ) + + +def _write_partition_artifact_metadata( + session: LanceFileSession, + *, + ivf_centroids: np.ndarray, + pq_codebook: np.ndarray, + metric_type: str, + num_bits: int, +) -> None: + metadata_table = _make_metadata_table(ivf_centroids, pq_codebook) + with session.open_writer( + 
PARTITION_ARTIFACT_METADATA_FILE_NAME, + schema=metadata_table.schema, + version="2.2", + ) as writer: + writer.add_schema_metadata("lance:index_build:artifact_version", "1") + writer.add_schema_metadata( + "lance:index_build:distance_type", _normalize_metric_type(metric_type) + ) + writer.add_schema_metadata( + "lance:index_build:num_partitions", str(ivf_centroids.shape[0]) + ) + writer.add_schema_metadata( + "lance:index_build:num_sub_vectors", str(pq_codebook.shape[0]) + ) + writer.add_schema_metadata("lance:index_build:num_bits", str(num_bits)) + writer.add_schema_metadata("lance:index_build:dimension", str(ivf_centroids.shape[1])) + writer.write_batch(metadata_table) + + +def _write_partition_artifact( + batches: Iterator[pa.RecordBatch], + *, + artifact_root: str | Path, + ivf_centroids: np.ndarray, + pq_codebook: np.ndarray, + metric_type: str, + num_bits: int, + num_partitions: int, + total_loss: float | None = None, +) -> tuple[str, list[str]]: + artifact_root = _normalize_artifact_root(artifact_root) + session = LanceFileSession(artifact_root) + builder = PartitionArtifactBuilder( + artifact_root, + num_partitions=num_partitions, + pq_code_width=pq_codebook.shape[0], + ) + for batch in batches: + builder.append_batch(batch) + + _write_partition_artifact_metadata( + session, + ivf_centroids=ivf_centroids, + pq_codebook=pq_codebook, + metric_type=metric_type, + num_bits=num_bits, + ) + artifact_files = builder.finish( + PARTITION_ARTIFACT_METADATA_FILE_NAME, + float(total_loss) if total_loss is not None else None, + ) + artifact_files.insert(1, PARTITION_ARTIFACT_METADATA_FILE_NAME) + return artifact_root, artifact_files + + def _to_cuvs_transform_input(matrix: np.ndarray): cupy = _optional_cupy() if cupy is None: @@ -251,6 +374,19 @@ def _train_ivf_pq_index_on_cuvs( num_bits: int = 8, filter_nan: bool = True, ): + if _has_rust_cuvs_backend(): + return _train_ivf_pq_on_cuvs_rust_impl( + _unwrap_dataset(dataset), + column, + num_partitions, + metric_type, + 
num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, + ) + if accelerator != "cuvs": raise ValueError("cuVS acceleration only supports accelerator='cuvs'") if num_bits != 8: @@ -310,7 +446,26 @@ def one_pass_assign_ivf_pq_on_cuvs( *, filter_nan: bool = True, ): - from . import write_dataset + if _has_rust_cuvs_backend(): + if accelerator != "cuvs": + raise ValueError("cuVS acceleration only supports accelerator='cuvs'") + if trained_index is None: + raise ValueError( + "one_pass_assign_ivf_pq_on_cuvs requires a trained cuVS index for " + "single-node transform" + ) + if dst_dataset_uri is None: + dst_dataset_uri = tempfile.mkdtemp() + artifact_files = _assign_ivf_pq_on_cuvs_rust_impl( + _unwrap_dataset(dataset), + column, + trained_index, + str(dst_dataset_uri), + batch_size=batch_size, + filter_nan=filter_nan, + ) + LOGGER.info("Saved precomputed partition artifact to %s", dst_dataset_uri) + return str(dst_dataset_uri), artifact_files if accelerator != "cuvs": raise ValueError("cuVS acceleration only supports accelerator='cuvs'") @@ -346,7 +501,7 @@ def one_pass_assign_ivf_pq_on_cuvs( output_schema = pa.schema( [ - pa.field("row_id", pa.uint64()), + pa.field(PARTITION_ARTIFACT_ROW_ID_COLUMN, pa.uint64()), pa.field("__ivf_part_id", pa.uint32()), pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), ] @@ -399,28 +554,19 @@ def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: if dst_dataset_uri is None: dst_dataset_uri = tempfile.mkdtemp() - if re.search(r".:\\", dst_dataset_uri) is not None: - dst_dataset_uri = dst_dataset_uri.replace("\\", "/", 1) - - reader = pa.RecordBatchReader.from_batches( - output_schema, _partition_and_pq_codes_assignment() - ) - ds = write_dataset( - reader, - dst_dataset_uri, - schema=output_schema, - data_storage_version="2.2", - ) - _annotate_precomputed_encoded_dataset( - ds, partition_sizes.astype(int).tolist() + artifact_root, artifact_files 
= _write_partition_artifact( + _partition_and_pq_codes_assignment(), + artifact_root=dst_dataset_uri, + ivf_centroids=ivf_centroids, + pq_codebook=pq_codebook, + metric_type=metric_type, + num_bits=8, + num_partitions=num_partitions, ) - shuffle_buffers = [ - data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() - ] progress.close() - LOGGER.info("Saved precomputed pq_codes to %s", dst_dataset_uri) - return str(dst_dataset_uri), shuffle_buffers + LOGGER.info("Saved precomputed partition artifact to %s", artifact_root) + return str(artifact_root), artifact_files def train_ivf_pq_on_cuvs( diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index be5f7cd2c6f..415ffdb8865 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2985,7 +2985,7 @@ def _create_index_impl( ) LOGGER.info("cuVS ivf+pq training time: %ss", ivfpq_train_time) timers["ivf+pq_assign:start"] = time.time() - shuffle_output_dir, _ = one_pass_assign_ivf_pq_on_cuvs( + artifact_root, _ = one_pass_assign_ivf_pq_on_cuvs( self, column[0], metric, @@ -2996,12 +2996,12 @@ def _create_index_impl( batch_size=1024 * 128, filter_nan=filter_nan, ) + kwargs["precomputed_partition_artifact_uri"] = artifact_root timers["ivf+pq_assign:end"] = time.time() ivfpq_assign_time = ( timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] ) LOGGER.info("cuVS ivf+pq transform time: %ss", ivfpq_assign_time) - kwargs["precomputed_encoded_dataset_uri"] = shuffle_output_dir else: from .vector import ( one_pass_assign_ivf_pq_on_accelerator, @@ -3216,6 +3216,13 @@ def _create_index_impl( "Temporary precomputed encoded dataset stored at %s, you may want to delete it.", kwargs["precomputed_encoded_dataset_uri"], ) + if "precomputed_partition_artifact_uri" in kwargs.keys() and os.path.exists( + kwargs["precomputed_partition_artifact_uri"] + ): + LOGGER.info( + "Temporary precomputed partition artifact stored at %s, you may want to delete it.", + 
kwargs["precomputed_partition_artifact_uri"], + ) return index def create_index( diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index f0be29f39ca..d377f381246 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -135,6 +135,20 @@ class LanceFileSession: def upload_file(self, local_path: str, remote_path: str) -> None: ... def download_file(self, remote_path: str, local_path: str) -> None: ... +class PartitionArtifactBuilder: + def __init__( + self, + uri_or_path: str, + num_partitions: int, + pq_code_width: int, + storage_options: Optional[Dict[str, str]] = None, + storage_options_provider: Optional[StorageOptionsProvider] = None, + ): ... + def append_batch(self, batch: pa.RecordBatch) -> None: ... + def finish( + self, metadata_file: str, total_loss: Optional[float] = None + ) -> List[str]: ... + class LanceFileReader: def __init__( self, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 1d17423bd68..c2b42de2ac1 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -22,12 +22,18 @@ import pytest from lance import LanceDataset, LanceFragment from lance.dataset import VectorIndexReader +from lance.file import LanceFileReader from lance.indices import IndexFileVersion, IndicesBuilder from lance.query import MatchQuery, PhraseQuery from lance.util import validate_vector_index # noqa: E402 from lance.vector import vec_to_table # noqa: E402 +def _disable_rust_cuvs_backend(monkeypatch): + monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", None) + monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", None) + + def create_table(nvec=1000, ndim=128, nans=0, nullify=False, dtype=np.float32): mat = np.random.randn(nvec, ndim) if nans > 0: @@ -585,8 +591,8 @@ def fake_assign( accelerator, ivf_centroids, pq_codebook, - trained_index=None, - 
dst_dataset_uri=None, + trained_index, + dst_path=None, batch_size=20480, *, filter_nan, @@ -595,36 +601,23 @@ def fake_assign( calls["assign_column"] = column calls["assign_metric_type"] = metric_type calls["assign_accelerator"] = accelerator + calls["assign_ivf_centroids"] = ivf_centroids + calls["assign_pq_codebook"] = pq_codebook calls["assign_trained_index"] = trained_index calls["assign_batch_size"] = batch_size calls["assign_filter_nan"] = filter_nan - - row_ids = dataset_arg.to_table(columns=[], with_row_id=True)[ - "_rowid" - ].to_numpy() - part_ids = pa.array(np.zeros(len(row_ids), dtype=np.uint32)) - pq_values = pa.array(np.zeros(len(row_ids) * 16, dtype=np.uint8)) - pq_codes = pa.FixedSizeListArray.from_arrays(pq_values, 16) - shuffle_ds_uri = str(tmp_path / "cuvs_shuffle_buffers") - shuffle_ds = lance.write_dataset( - pa.Table.from_arrays( - [pa.array(row_ids), part_ids, pq_codes], - names=["row_id", "__ivf_part_id", "__pq_code"], - ), - shuffle_ds_uri, - ) - lance_cuvs._annotate_precomputed_encoded_dataset( - shuffle_ds, [len(row_ids), 0, 0, 0] - ) - shuffle_buffers = [ - data_file.path - for frag in shuffle_ds.get_fragments() - for data_file in frag.data_files() + return str(tmp_path / "cuvs_artifact"), [ + "manifest.json", + "metadata.lance", + "partitions/bucket-00000.lance", ] - return shuffle_ds_uri, shuffle_buffers monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_index_on_cuvs", fake_train) - monkeypatch.setattr(lance_cuvs, "one_pass_assign_ivf_pq_on_cuvs", fake_assign) + monkeypatch.setattr( + lance_cuvs, + "one_pass_assign_ivf_pq_on_cuvs", + fake_assign, + ) dataset = dataset.create_index( "vector", @@ -712,6 +705,7 @@ def fake_prepare( def test_train_ivf_pq_on_cuvs_nullable_vectors(tmp_path, monkeypatch): + _disable_rust_cuvs_backend(monkeypatch) tbl = create_table(nvec=32, ndim=16, nullify=True) dataset = lance.write_dataset(tbl, tmp_path) @@ -747,9 +741,59 @@ def build(build_params, matrix): assert pq_codebook.shape == (4, 256, 4) +def 
test_train_ivf_pq_on_cuvs_prefers_rust_backend(tmp_path, monkeypatch): + dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) + calls = {} + + class FakeRustIndex: + pass + + def fake_train(*args, **kwargs): + calls["args"] = args + calls["kwargs"] = kwargs + return ( + FakeRustIndex(), + pa.FixedSizeListArray.from_arrays( + pa.array(np.arange(64, dtype=np.float32)), 16 + ), + pa.FixedSizeListArray.from_arrays( + pa.array(np.arange(4 * 256 * 4, dtype=np.float32)), 4 + ), + ) + + monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", fake_train) + monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", object()) + monkeypatch.setattr( + lance_cuvs, + "_require_cuvs", + lambda: (_ for _ in ()).throw(AssertionError("python cuVS backend should not run")), + ) + + trained_index, centroids, pq_codebook = lance_cuvs._train_ivf_pq_index_on_cuvs( + dataset, + "vector", + 4, + "l2", + "cuvs", + 4, + sample_rate=8, + max_iters=30, + num_bits=8, + filter_nan=True, + ) + + assert isinstance(trained_index, FakeRustIndex) + assert calls["args"][:5] == (dataset, "vector", 4, "l2", 4) + assert calls["kwargs"]["sample_rate"] == 8 + assert calls["kwargs"]["max_iters"] == 30 + assert isinstance(centroids, pa.FixedSizeListArray) + assert isinstance(pq_codebook, pa.FixedSizeListArray) + + def test_train_ivf_pq_on_cuvs_uses_num_sub_vectors_for_pq_dim( tmp_path, monkeypatch ): + _disable_rust_cuvs_backend(monkeypatch) dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) calls = {} @@ -836,7 +880,8 @@ def make_table(partition_ids: list[int], row_id_start: int): assert partition_fragments == [[0], [0], [1], [1]] -def test_one_pass_assign_ivf_pq_on_cuvs_writes_encoded_dataset(tmp_path, monkeypatch): +def test_one_pass_assign_ivf_pq_on_cuvs_writes_partition_artifact(tmp_path, monkeypatch): + _disable_rust_cuvs_backend(monkeypatch) tbl = create_table(nvec=32, ndim=16) dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") 
@@ -875,7 +920,7 @@ def transform(index, vectors): monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) monkeypatch.setattr(lance_cuvs, "_optional_cupy", lambda: FakeCupyModule()) - shuffle_uri, shuffle_buffers = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( + artifact_root, artifact_files = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( dataset, "vector", "l2", @@ -886,33 +931,82 @@ def transform(index, vectors): batch_size=8, ) - shuffle_ds = lance.dataset(shuffle_uri) - data_batch = next(shuffle_ds.to_batches(batch_size=1024)) + manifest_path = Path(artifact_root) / lance_cuvs.PARTITION_ARTIFACT_MANIFEST_FILE_NAME + metadata_path = Path(artifact_root) / lance_cuvs.PARTITION_ARTIFACT_METADATA_FILE_NAME - assert len(shuffle_buffers) > 0 - assert all(path.endswith(".lance") for path in shuffle_buffers) - assert data_batch.column("row_id").type == pa.uint64() - assert data_batch.column("__ivf_part_id").type == pa.uint32() - assert data_batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) - metadata = shuffle_ds.metadata() - assert json.loads( - metadata[ - lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY - ] - ) == [8, 8, 8, 8] - partition_fragments = json.loads( - metadata[ - lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY - ] + assert manifest_path.exists() + assert metadata_path.exists() + assert any(path.endswith(".lance") for path in artifact_files) + + manifest = json.loads(manifest_path.read_text()) + assert manifest["version"] == lance_cuvs.PARTITION_ARTIFACT_MANIFEST_VERSION + assert manifest["num_partitions"] == 4 + assert manifest["metadata_file"] == lance_cuvs.PARTITION_ARTIFACT_METADATA_FILE_NAME + assert [entry["num_rows"] for entry in manifest["partitions"]] == [8, 8, 8, 8] + assert all(entry["path"] for entry in manifest["partitions"]) + assert all(entry["ranges"] for entry in manifest["partitions"]) + + metadata_reader = LanceFileReader(str(metadata_path)) + metadata_table = 
metadata_reader.read_all().to_table() + assert metadata_table.column("_ivf_centroids").type == pa.list_(pa.list_(pa.float32(), 16)) + assert metadata_table.column("_pq_codebook").type == pa.list_(pa.list_(pa.float32(), 4)) + + bucket_path = Path(artifact_root) / manifest["partitions"][0]["path"] + bucket_reader = LanceFileReader(str(bucket_path)) + bucket_table = bucket_reader.read_all().to_table() + assert bucket_table.column("_rowid").type == pa.uint64() + assert bucket_table.column("__pq_code").type == pa.list_(pa.uint8(), 4) + + +def test_one_pass_assign_ivf_pq_on_cuvs_prefers_rust_backend(tmp_path, monkeypatch): + dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path / "cuvs_assign_rust") + calls = {} + + class FakeRustIndex: + pass + + def fake_assign(*args, **kwargs): + calls["args"] = args + calls["kwargs"] = kwargs + return ["manifest.json", "metadata.lance", "partitions/bucket-00000.lance"] + + monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", object()) + monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", fake_assign) + monkeypatch.setattr( + lance_cuvs, + "_require_cuvs", + lambda: (_ for _ in ()).throw(AssertionError("python cuVS backend should not run")), + ) + + artifact_root, artifact_files = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( + dataset, + "vector", + "l2", + "cuvs", + np.random.randn(4, 16).astype(np.float32), + np.random.randn(4, 256, 4).astype(np.float32), + trained_index=FakeRustIndex(), + dst_dataset_uri=tmp_path / "artifact", + batch_size=4096, + ) + + assert artifact_root == str(tmp_path / "artifact") + assert artifact_files[0] == "manifest.json" + assert calls["args"][:4] == ( + dataset, + "vector", + calls["args"][2], + str(tmp_path / "artifact"), ) - assert len(partition_fragments) == 4 - assert all(partition_fragments) + assert isinstance(calls["args"][2], FakeRustIndex) + assert calls["kwargs"]["batch_size"] == 4096 def 
test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( tmp_path, monkeypatch, ): + _disable_rust_cuvs_backend(monkeypatch) tbl = create_table(nvec=32, ndim=128) dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_incompatible") diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 62e852c117e..19c3e4ec5d4 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -3375,6 +3375,10 @@ fn prepare_vector_index_params( ivf_params.precomputed_encoded_dataset_uri = Some(uri.to_string()); }; + if let Some(uri) = kwargs.get_item("precomputed_partition_artifact_uri")? { + ivf_params.precomputed_partition_artifact_uri = Some(uri.to_string()); + }; + if let Some(storage_options) = storage_options { ivf_params.storage_options = Some(storage_options); } diff --git a/python/src/file.rs b/python/src/file.rs index da8ba3e76bb..eb830dc4a73 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -18,6 +18,7 @@ use arrow_array::{RecordBatch, RecordBatchReader, UInt32Array}; use arrow_schema::Schema as ArrowSchema; use bytes::Bytes; use futures::stream::StreamExt; +use lance::index::vector::PartitionArtifactBuilder as CorePartitionArtifactBuilder; use lance::io::{ObjectStore, RecordBatchStream}; use lance_core::cache::LanceCache; use lance_core::utils::path::LancePathExt; @@ -370,6 +371,82 @@ impl Drop for LanceFileWriter { } } +#[pyclass] +pub struct PartitionArtifactBuilder { + inner: Arc>, +} + +impl PartitionArtifactBuilder { + #[allow(clippy::too_many_arguments)] + async fn open( + uri_or_path: String, + num_partitions: usize, + pq_code_width: usize, + storage_options: Option>, + storage_options_provider: Option>, + ) -> PyResult { + let (object_store, path) = object_store_from_uri_or_path_with_provider( + uri_or_path, + storage_options, + storage_options_provider, + ) + .await?; + let inner = CorePartitionArtifactBuilder::try_new_with_store( + object_store, + path, + num_partitions, + pq_code_width, + ) + .infer_error()?; + Ok(Self 
{ + inner: Arc::new(Mutex::new(inner)), + }) + } +} + +#[pymethods] +impl PartitionArtifactBuilder { + #[new] + #[pyo3(signature=(uri_or_path, num_partitions, pq_code_width, storage_options=None, storage_options_provider=None))] + #[allow(clippy::too_many_arguments)] + pub fn new( + uri_or_path: String, + num_partitions: usize, + pq_code_width: usize, + storage_options: Option>, + storage_options_provider: Option<&Bound<'_, PyAny>>, + ) -> PyResult { + let provider = storage_options_provider + .map(crate::storage_options::py_object_to_storage_options_provider) + .transpose()?; + rt().block_on( + None, + Self::open( + uri_or_path, + num_partitions, + pq_code_width, + storage_options, + provider, + ), + )? + } + + pub fn append_batch(&self, batch: PyArrowType) -> PyResult<()> { + rt().runtime.block_on(async { + self.inner.lock().await.append_batch(&batch.0).await + }) + .infer_error() + } + + #[pyo3(signature=(metadata_file, total_loss=None))] + pub fn finish(&self, metadata_file: String, total_loss: Option) -> PyResult> { + rt().runtime.block_on(async { + self.inner.lock().await.finish(&metadata_file, total_loss).await + }) + .infer_error() + } +} + pub async fn object_store_from_uri_or_path_no_options( uri_or_path: impl AsRef, ) -> PyResult<(Arc, Path)> { diff --git a/python/src/indices.rs b/python/src/indices.rs index cea7f2a968a..cb8288b51a5 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -32,6 +32,8 @@ use pyo3::{ }; use lance::index::DatasetIndexInternalExt; +#[cfg(feature = "cuvs")] +use lance_cuvs::TrainedIvfPqIndex; use crate::fragment::FileFragment; use crate::utils::{PyJson, PyLance}; @@ -155,6 +157,89 @@ impl PyIvfModel { } } +#[cfg(feature = "cuvs")] +#[pyclass(name = "_CuvsIvfPqIndex", module = "lance.indices", unsendable)] +pub struct PyCuvsIvfPqIndex { + inner: TrainedIvfPqIndex, +} + +#[cfg(feature = "cuvs")] +#[pyfunction] +#[allow(clippy::too_many_arguments)] +#[pyo3( + signature=( + dataset, + column, + num_partitions, + 
distance_type, + num_sub_vectors, + sample_rate=256, + max_iters=50, + num_bits=8, + filter_nan=true + ) +)] +fn _train_ivf_pq_on_cuvs_rust<'py>( + py: Python<'py>, + dataset: &Dataset, + column: &str, + num_partitions: u32, + distance_type: &str, + num_sub_vectors: u32, + sample_rate: u32, + max_iters: u32, + num_bits: u8, + filter_nan: bool, +) -> PyResult<(Py, Bound<'py, PyAny>, Bound<'py, PyAny>)> { + let distance_type = DistanceType::try_from(distance_type).unwrap(); + let trained = rt() + .runtime + .block_on(lance_cuvs::train_ivf_pq( + dataset.ds.as_ref(), + column, + num_partitions as usize, + distance_type, + num_sub_vectors as usize, + sample_rate as usize, + max_iters as usize, + num_bits as usize, + filter_nan, + )) + .infer_error()?; + let ivf_centroids = trained.ivf_centroids().clone().into_data().to_pyarrow(py)?; + let pq_codebook = trained.pq_codebook().clone().into_data().to_pyarrow(py)?; + Ok(( + Py::new(py, PyCuvsIvfPqIndex { inner: trained })?, + ivf_centroids, + pq_codebook, + )) +} + +#[cfg(feature = "cuvs")] +#[pyfunction] +#[pyo3(signature=(dataset, column, trained_index, artifact_root, batch_size=1024 * 128, filter_nan=true))] +fn _assign_ivf_pq_on_cuvs_rust( + py: Python<'_>, + dataset: &Dataset, + column: &str, + trained_index: &PyCuvsIvfPqIndex, + artifact_root: &str, + batch_size: usize, + filter_nan: bool, +) -> PyResult> { + let _ = py; + rt().runtime + .block_on(lance_cuvs::assign_ivf_pq_to_artifact( + dataset.ds.as_ref(), + column, + &trained_index.inner, + artifact_root, + batch_size, + filter_nan, + )) + .infer_error() +} + /// Internal helper to fetch an IVF model for the given index name. 
async fn do_get_ivf_model(dataset: &Dataset, index_name: &str) -> PyResult { use lance_index::metrics::NoOpMetricsCollector; @@ -716,6 +801,12 @@ pub fn register_indices(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { indices.add_class::()?; indices.add_class::()?; indices.add_wrapped(wrap_pyfunction!(get_ivf_model))?; + #[cfg(feature = "cuvs")] + { + indices.add_class::()?; + indices.add_wrapped(wrap_pyfunction!(_train_ivf_pq_on_cuvs_rust))?; + indices.add_wrapped(wrap_pyfunction!(_assign_ivf_pq_on_cuvs_rust))?; + } m.add_submodule(&indices)?; Ok(()) } diff --git a/python/src/lib.rs b/python/src/lib.rs index 9730f2ba1c5..819e3fddc3e 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -51,7 +51,8 @@ use dataset::{ use env_logger::{Builder, Env}; use file::{ LanceBufferDescriptor, LanceColumnMetadata, LanceFileMetadata, LanceFileReader, - LanceFileStatistics, LanceFileWriter, LancePageMetadata, stable_version, + LanceFileStatistics, LanceFileWriter, LancePageMetadata, PartitionArtifactBuilder, + stable_version, }; use log::Level; use pyo3::exceptions::PyIOError; @@ -258,6 +259,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/rust/lance-cuvs/Cargo.toml b/rust/lance-cuvs/Cargo.toml new file mode 100644 index 00000000000..a001f82c16f --- /dev/null +++ b/rust/lance-cuvs/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "lance-cuvs" +version = "5.0.0-beta.2" +edition = "2024" +authors = ["Lance Devs "] +license = "Apache-2.0" +rust-version = "1.91" +publish = false + +[dependencies] +arrow = "57.0.0" +arrow-array = "57.0.0" +arrow-buffer = "57.0.0" +arrow-schema = "57.0.0" +cuvs = "26.2.0" +cuvs-sys = "26.2.0" +futures = "0.3" +half = { version = "2.5", default-features = false, features = ["num-traits", "std"] } +lance = { path = "../lance" } +lance-arrow = { path = "../lance-arrow" } +lance-core = { 
path = "../lance-core" } +lance-file = { path = "../lance-file" } +lance-index = { path = "../lance-index" } +lance-io = { path = "../lance-io" } +lance-linalg = { path = "../lance-linalg" } +log = "0.4" +ndarray = { version = "0.16.1", features = ["matrixmultiply-threading"] } +tokio = { version = "1.48", features = ["rt-multi-thread"] } diff --git a/rust/lance-cuvs/src/lib.rs b/rust/lance-cuvs/src/lib.rs new file mode 100644 index 00000000000..db54ce47f22 --- /dev/null +++ b/rust/lance-cuvs/src/lib.rs @@ -0,0 +1,1237 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::ffi::{CStr, c_void}; +use std::marker::PhantomData; +use std::ptr; +use std::sync::Arc; +use arrow::compute::filter; +use arrow_array::cast::AsArray; +use arrow_array::types::{Float16Type, Float32Type, Float64Type, UInt8Type}; +use arrow_array::{ + Array, FixedSizeListArray, Float32Array, ListArray, RecordBatch, UInt8Array, UInt32Array, + UInt64Array, +}; +use arrow_buffer::{OffsetBuffer, ScalarBuffer}; +use arrow_schema::{DataType, Field, Schema as ArrowSchema}; +use cuvs::Resources; +use futures::TryStreamExt; +use lance::dataset::Dataset; +use lance::index::vector::PartitionArtifactBuilder; +use lance::index::vector::utils::{infer_vector_dim, vector_column_to_fsl}; +use lance_arrow::{FixedSizeListArrayExt, RecordBatchExt}; +use lance_core::{Error, ROW_ID, Result}; +use lance_file::version::LanceFileVersion; +use lance_file::writer::{FileWriter, FileWriterOptions}; +use lance_index::vector::utils::is_finite; +use lance_index::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN, PQ_CODE_COLUMN}; +use lance_linalg::distance::DistanceType; +use log::warn; +use ndarray::{Array2, ArrayView2}; + +const PARTITION_ARTIFACT_METADATA_FILE_NAME: &str = "metadata.lance"; +const PARTITION_ARTIFACT_FILE_VERSION: &str = "2.2"; +const PIPELINE_SLOTS: usize = 2; + +type CudaEventHandle = *mut c_void; + +#[link(name = "cudart")] +unsafe extern "C" { + fn 
cudaMallocHost(ptr: *mut *mut c_void, size: usize) -> cuvs_sys::cudaError_t; + fn cudaFreeHost(ptr: *mut c_void) -> cuvs_sys::cudaError_t; + fn cudaEventCreate(event: *mut CudaEventHandle) -> cuvs_sys::cudaError_t; + fn cudaEventDestroy(event: CudaEventHandle) -> cuvs_sys::cudaError_t; + fn cudaEventRecord( + event: CudaEventHandle, + stream: cuvs_sys::cudaStream_t, + ) -> cuvs_sys::cudaError_t; + fn cudaEventSynchronize(event: CudaEventHandle) -> cuvs_sys::cudaError_t; +} + +pub struct TrainedIvfPqIndex { + resources: Resources, + index: CuvsIvfPqIndex, + num_partitions: usize, + dimension: usize, + num_sub_vectors: usize, + num_bits: usize, + metric_type: DistanceType, + ivf_centroids: FixedSizeListArray, + pq_codebook: FixedSizeListArray, +} + +impl TrainedIvfPqIndex { + pub fn ivf_centroids(&self) -> &FixedSizeListArray { + &self.ivf_centroids + } + + pub fn pq_codebook(&self) -> &FixedSizeListArray { + &self.pq_codebook + } + + pub fn num_partitions(&self) -> usize { + self.num_partitions + } + + pub fn pq_code_width(&self) -> usize { + self.num_sub_vectors + } + + pub fn metric_type(&self) -> DistanceType { + self.metric_type + } + + pub fn num_bits(&self) -> usize { + self.num_bits + } +} + +struct CuvsIvfPqIndex { + raw: cuvs_sys::cuvsIvfPqIndex_t, +} + +impl CuvsIvfPqIndex { + fn try_new() -> Result { + let mut raw = ptr::null_mut(); + check_cuvs( + unsafe { cuvs_sys::cuvsIvfPqIndexCreate(&mut raw) }, + "create IVF_PQ index", + )?; + Ok(Self { raw }) + } +} + +impl Drop for CuvsIvfPqIndex { + fn drop(&mut self) { + if !self.raw.is_null() { + let _ = unsafe { cuvs_sys::cuvsIvfPqIndexDestroy(self.raw) }; + } + } +} + +enum MatrixBuffer<'a> { + Borrowed { + values: &'a [f32], + rows: usize, + cols: usize, + }, + Owned(Array2), +} + +impl MatrixBuffer<'_> { + fn view(&self) -> Result> { + match self { + Self::Borrowed { values, rows, cols } => ArrayView2::from_shape((*rows, *cols), values) + .map_err(|error| { + Error::io(format!("failed to create borrowed 
matrix view: {error}")) + }), + Self::Owned(array) => Ok(array.view()), + } + } + + fn rows(&self) -> usize { + match self { + Self::Borrowed { rows, .. } => *rows, + Self::Owned(array) => array.nrows(), + } + } +} + +struct HostTensorView { + shape: Vec, + tensor: cuvs_sys::DLManagedTensor, +} + +impl HostTensorView { + fn try_new(shape: &[usize], data: *mut std::ffi::c_void) -> Self { + let shape = shape.iter().map(|dim| *dim as i64).collect::>(); + let tensor = cuvs_sys::DLManagedTensor { + dl_tensor: cuvs_sys::DLTensor { + data, + device: cuvs_sys::DLDevice { + device_type: cuvs_sys::DLDeviceType::kDLCPU, + device_id: 0, + }, + ndim: shape.len() as i32, + dtype: T::dl_dtype(), + shape: shape.as_ptr() as *mut i64, + strides: ptr::null_mut(), + byte_offset: 0, + }, + manager_ctx: ptr::null_mut(), + deleter: None, + }; + Self { shape, tensor } + } + + fn as_mut_ptr(&mut self) -> *mut cuvs_sys::DLManagedTensor { + debug_assert_eq!(self.shape.len(), self.tensor.dl_tensor.ndim as usize); + &mut self.tensor + } +} + +trait DlElement: Copy + Default { + fn dl_dtype() -> cuvs_sys::DLDataType; +} + +impl DlElement for f32 { + fn dl_dtype() -> cuvs_sys::DLDataType { + cuvs_sys::DLDataType { + code: cuvs_sys::DLDataTypeCode::kDLFloat as u8, + bits: 32, + lanes: 1, + } + } +} + +impl DlElement for u8 { + fn dl_dtype() -> cuvs_sys::DLDataType { + cuvs_sys::DLDataType { + code: cuvs_sys::DLDataTypeCode::kDLUInt as u8, + bits: 8, + lanes: 1, + } + } +} + +impl DlElement for u32 { + fn dl_dtype() -> cuvs_sys::DLDataType { + cuvs_sys::DLDataType { + code: cuvs_sys::DLDataTypeCode::kDLUInt as u8, + bits: 32, + lanes: 1, + } + } +} + +struct DeviceTensor { + shape: Vec, + tensor: cuvs_sys::DLManagedTensor, + capacity_bytes: usize, + resources: cuvs_sys::cuvsResources_t, + _marker: PhantomData, +} + +impl DeviceTensor { + fn try_new(resources: &Resources, shape: &[usize]) -> Result { + let capacity_bytes = shape.iter().product::() * std::mem::size_of::(); + let mut data = 
ptr::null_mut(); + check_cuvs( + unsafe { cuvs_sys::cuvsRMMAlloc(resources.0, &mut data, capacity_bytes) }, + "allocate device tensor", + )?; + let shape = shape.iter().map(|dim| *dim as i64).collect::>(); + let tensor = cuvs_sys::DLManagedTensor { + dl_tensor: cuvs_sys::DLTensor { + data, + device: cuvs_sys::DLDevice { + device_type: cuvs_sys::DLDeviceType::kDLCUDA, + device_id: 0, + }, + ndim: shape.len() as i32, + dtype: T::dl_dtype(), + shape: shape.as_ptr() as *mut i64, + strides: ptr::null_mut(), + byte_offset: 0, + }, + manager_ctx: ptr::null_mut(), + deleter: None, + }; + Ok(Self { + shape, + tensor, + capacity_bytes, + resources: resources.0, + _marker: PhantomData, + }) + } + + fn as_mut_ptr(&mut self) -> *mut cuvs_sys::DLManagedTensor { + debug_assert_eq!(self.shape.len(), self.tensor.dl_tensor.ndim as usize); + &mut self.tensor + } + + fn set_shape(&mut self, shape: &[usize]) -> Result<()> { + if shape.len() != self.shape.len() { + return Err(Error::io(format!( + "device tensor rank mismatch: expected {}, got {}", + self.shape.len(), + shape.len() + ))); + } + let required_bytes = shape.iter().product::() * std::mem::size_of::(); + if required_bytes > self.capacity_bytes { + return Err(Error::io(format!( + "device tensor capacity {} bytes is smaller than requested shape {:?} ({} bytes)", + self.capacity_bytes, shape, required_bytes + ))); + } + for (dst, src) in self.shape.iter_mut().zip(shape) { + *dst = *src as i64; + } + Ok(()) + } + + fn current_len(&self) -> usize { + self.shape.iter().map(|dim| *dim as usize).product() + } + + fn current_bytes(&self) -> usize { + self.current_len() * std::mem::size_of::() + } + + fn copy_from_host_async(&mut self, resources: &Resources, src: &[T]) -> Result<()> { + let expected_len = self.current_len(); + if src.len() != expected_len { + return Err(Error::io(format!( + "device tensor copy expects {expected_len} elements, got {}", + src.len() + ))); + } + check_cuda( + unsafe { + cuvs_sys::cudaMemcpyAsync( + 
self.tensor.dl_tensor.data, + src.as_ptr() as *const _, + self.current_bytes(), + cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, + resources + .get_cuda_stream() + .map_err(|e| Error::io(e.to_string()))?, + ) + }, + "copy host tensor to device", + ) + } + + fn copy_to_host_async(&self, resources: &Resources, dst: &mut [T]) -> Result<()> { + let expected_len = self.current_len(); + if dst.len() != expected_len { + return Err(Error::io(format!( + "device tensor copy expects destination length {expected_len}, got {}", + dst.len() + ))); + } + check_cuda( + unsafe { + cuvs_sys::cudaMemcpyAsync( + dst.as_mut_ptr() as *mut _, + self.tensor.dl_tensor.data, + self.current_bytes(), + cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, + resources + .get_cuda_stream() + .map_err(|e| Error::io(e.to_string()))?, + ) + }, + "copy device tensor to host", + ) + } +} + +impl Drop for DeviceTensor { + fn drop(&mut self) { + if !self.tensor.dl_tensor.data.is_null() { + let _ = unsafe { + cuvs_sys::cuvsRMMFree( + self.resources, + self.tensor.dl_tensor.data, + self.capacity_bytes, + ) + }; + } + } +} + +struct PinnedHostBuffer { + ptr: *mut T, + len: usize, + _marker: PhantomData, +} + +impl PinnedHostBuffer { + fn try_new(len: usize) -> Result { + let bytes = len + .checked_mul(std::mem::size_of::()) + .ok_or_else(|| Error::io("pinned host allocation size overflow"))?; + let mut raw = ptr::null_mut(); + check_cuda( + unsafe { cudaMallocHost(&mut raw, bytes) }, + "allocate pinned host buffer", + )?; + Ok(Self { + ptr: raw.cast::(), + len, + _marker: PhantomData, + }) + } + + fn as_slice(&self) -> &[T] { + unsafe { std::slice::from_raw_parts(self.ptr, self.len) } + } + + fn as_mut_slice(&mut self) -> &mut [T] { + unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) } + } + + fn prefix(&self, len: usize) -> Result<&[T]> { + if len > self.len { + return Err(Error::io(format!( + "pinned host buffer length {} is smaller than requested prefix {}", + self.len, len + ))); + } + 
Ok(&self.as_slice()[..len])
    }

    /// Mutable counterpart of `prefix`: borrow the first `len` elements.
    fn prefix_mut(&mut self, len: usize) -> Result<&mut [T]> {
        if len <= self.len {
            Ok(&mut self.as_mut_slice()[..len])
        } else {
            Err(Error::io(format!(
                "pinned host buffer length {} is smaller than requested prefix {}",
                self.len, len
            )))
        }
    }

    /// Copy `src` into the head of this pinned buffer.
    fn copy_from_slice(&mut self, src: &[T]) -> Result<()> {
        let needed = src.len();
        if needed > self.len {
            return Err(Error::io(format!(
                "pinned host buffer length {} is smaller than source length {}",
                self.len, needed
            )));
        }
        self.prefix_mut(needed)?.copy_from_slice(src);
        Ok(())
    }
}

impl<T> Drop for PinnedHostBuffer<T> {
    fn drop(&mut self) {
        // Any `cudaFreeHost` failure is intentionally swallowed: drop cannot
        // propagate an error.
        if self.ptr.is_null() {
            return;
        }
        let _ = unsafe { cudaFreeHost(self.ptr.cast::<std::ffi::c_void>()) };
    }
}

/// Thin RAII wrapper around a raw CUDA event handle.
struct CudaEvent {
    raw: CudaEventHandle,
}

impl CudaEvent {
    /// Allocate a fresh CUDA event.
    fn try_new() -> Result<Self> {
        let mut raw = ptr::null_mut();
        check_cuda(unsafe { cudaEventCreate(&mut raw) }, "create CUDA event")?;
        Ok(Self { raw })
    }

    /// Enqueue this event on `stream`.
    fn record(&self, stream: cuvs_sys::cudaStream_t) -> Result<()> {
        let status = unsafe { cudaEventRecord(self.raw, stream) };
        check_cuda(status, "record CUDA event")
    }

    /// Block the host until the event has been reached on its stream.
    fn synchronize(&self) -> Result<()> {
        let status = unsafe { cudaEventSynchronize(self.raw) };
        check_cuda(status, "synchronize CUDA event")
    }
}

impl Drop for CudaEvent {
    fn drop(&mut self) {
        if self.raw.is_null() {
            return;
        }
        let _ = unsafe { cudaEventDestroy(self.raw) };
    }
}

/// Turn a cuVS status into a `Result`, decorating failures with the last
/// cuVS error text when one is available.
fn check_cuvs(status: cuvs_sys::cuvsError_t, context: &str) -> Result<()> {
    if status == cuvs_sys::cuvsError_t::CUVS_SUCCESS {
        return Ok(());
    }

    // SAFETY: `cuvsGetLastErrorText` returns either null or a valid C string
    // owned by cuVS; we only read from it.
    let message = unsafe {
        let text = cuvs_sys::cuvsGetLastErrorText();
        if text.is_null() {
            format!("{status:?}")
        } else {
            format!(
                "{status:?}: {}",
                CStr::from_ptr(text).to_string_lossy().into_owned()
            )
        }
    };
    Err(Error::io(format!("cuVS failed to {context}: {message}")))
}

/// Turn a raw CUDA runtime status into a `Result`.
fn check_cuda(status: cuvs_sys::cudaError_t, context: &str) -> Result<()> {
    if status == cuvs_sys::cudaError::cudaSuccess {
        Ok(())
    } else {
Err(Error::io(format!("CUDA failed to {context}: {status:?}")))
    }
}

/// Map a Lance `DistanceType` onto the equivalent cuVS distance enum.
///
/// Returns `Error::not_supported` for metrics cuVS IVF_PQ cannot handle.
fn cuvs_distance_type(metric_type: DistanceType) -> Result<cuvs_sys::cuvsDistanceType> {
    match metric_type {
        DistanceType::L2 => Ok(cuvs_sys::cuvsDistanceType::L2Expanded),
        DistanceType::Cosine => Ok(cuvs_sys::cuvsDistanceType::CosineExpanded),
        DistanceType::Dot => Ok(cuvs_sys::cuvsDistanceType::InnerProduct),
        other => Err(Error::not_supported(format!(
            "cuVS IVF_PQ does not support metric {other:?}"
        ))),
    }
}

/// Allocate and populate a raw `cuvsIvfPqIndexParams_t`.
///
/// The caller owns the returned handle and must release it with
/// `destroy_index_params`.
fn create_index_params(
    metric_type: DistanceType,
    num_partitions: usize,
    num_sub_vectors: usize,
    sample_rate: usize,
    max_iters: usize,
    num_bits: usize,
) -> Result<cuvs_sys::cuvsIvfPqIndexParams_t> {
    // Resolve the metric *before* allocating the params handle; the previous
    // order leaked the allocation whenever the metric was unsupported.
    let metric = cuvs_distance_type(metric_type)?;
    let mut params = ptr::null_mut();
    check_cuvs(
        unsafe { cuvs_sys::cuvsIvfPqIndexParamsCreate(&mut params) },
        "allocate IVF_PQ index params",
    )?;
    // SAFETY: `params` was just allocated by cuVS and is non-null on success.
    unsafe {
        (*params).metric = metric;
        (*params).metric_arg = 0.0;
        // Vectors are added later via the transform path, not at build time.
        (*params).add_data_on_build = false;
        (*params).n_lists = num_partitions as u32;
        (*params).kmeans_n_iters = max_iters as u32;
        (*params).kmeans_trainset_fraction = 1.0;
        (*params).pq_bits = num_bits as u32;
        (*params).pq_dim = num_sub_vectors as u32;
        (*params).codebook_kind =
            cuvs_sys::cuvsIvfPqCodebookGen::CUVS_IVF_PQ_CODEBOOK_GEN_PER_SUBSPACE;
        (*params).force_random_rotation = false;
        (*params).conservative_memory_allocation = false;
        (*params).max_train_points_per_pq_code = sample_rate as u32;
        (*params).codes_layout = cuvs_sys::cuvsIvfPqListLayout::CUVS_IVF_PQ_LIST_LAYOUT_FLAT;
    }
    Ok(params)
}

/// Release a params handle created by `create_index_params`; null is a no-op.
fn destroy_index_params(params: cuvs_sys::cuvsIvfPqIndexParams_t) {
    if !params.is_null() {
        let _ = unsafe { cuvs_sys::cuvsIvfPqIndexParamsDestroy(params) };
    }
}

/// Build an empty host-side DLPack tensor view that cuVS getter calls fill in.
fn make_tensor_view() -> HostTensorView {
    let shape = Vec::new();
    let tensor = cuvs_sys::DLManagedTensor {
        dl_tensor: cuvs_sys::DLTensor {
            data: ptr::null_mut(),
            device: cuvs_sys::DLDevice {
                device_type: cuvs_sys::DLDeviceType::kDLCPU,
                device_id: 0,
            },
+ ndim: 0, + dtype: ::dl_dtype(), + shape: shape.as_ptr() as *mut i64, + strides: ptr::null_mut(), + byte_offset: 0, + }, + manager_ctx: ptr::null_mut(), + deleter: None, + }; + HostTensorView { shape, tensor } +} + +fn tensor_shape(tensor: &cuvs_sys::DLManagedTensor) -> Vec { + let dl_tensor = &tensor.dl_tensor; + (0..dl_tensor.ndim) + .map(|idx| unsafe { *dl_tensor.shape.add(idx as usize) as usize }) + .collect() +} + +fn tensor_num_bytes(tensor: &cuvs_sys::DLManagedTensor) -> usize { + let shape = tensor_shape(tensor); + let numel = shape.into_iter().product::(); + numel * ((tensor.dl_tensor.dtype.bits as usize) / 8) +} + +fn copy_tensor_to_host_f32_2d( + resources: &Resources, + tensor: &cuvs_sys::DLManagedTensor, +) -> Result> { + let shape = tensor_shape(tensor); + if shape.len() != 2 { + return Err(Error::io(format!( + "expected 2D tensor, got shape {shape:?}" + ))); + } + let mut array = Array2::::zeros((shape[0], shape[1])); + check_cuda( + unsafe { + cuvs_sys::cudaMemcpyAsync( + array.as_mut_ptr() as *mut _, + tensor.dl_tensor.data, + tensor_num_bytes(tensor), + cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, + resources + .get_cuda_stream() + .map_err(|e| Error::io(e.to_string()))?, + ) + }, + "copy tensor to host", + )?; + resources + .sync_stream() + .map_err(|e| Error::io(e.to_string()))?; + Ok(array) +} + +fn copy_tensor_to_host_f32_3d( + resources: &Resources, + tensor: &cuvs_sys::DLManagedTensor, +) -> Result<(Vec, [usize; 3])> { + let shape = tensor_shape(tensor); + if shape.len() != 3 { + return Err(Error::io(format!( + "expected 3D tensor, got shape {shape:?}" + ))); + } + let mut values = vec![0.0f32; shape.iter().product()]; + check_cuda( + unsafe { + cuvs_sys::cudaMemcpyAsync( + values.as_mut_ptr() as *mut _, + tensor.dl_tensor.data, + tensor_num_bytes(tensor), + cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, + resources + .get_cuda_stream() + .map_err(|e| Error::io(e.to_string()))?, + ) + }, + "copy tensor to host", + )?; + resources + 
.sync_stream() + .map_err(|e| Error::io(e.to_string()))?; + Ok((values, [shape[0], shape[1], shape[2]])) +} + +fn infer_dimension(dataset: &Dataset, column: &str) -> Result { + let field = dataset.schema().field(column).ok_or_else(|| { + Error::invalid_input(format!( + "column '{column}' does not exist in dataset schema" + )) + })?; + infer_vector_dim(&field.data_type()) +} + +fn matrix_from_vectors<'a>(vectors: &'a FixedSizeListArray) -> Result> { + let dim = vectors.value_length() as usize; + match vectors.value_type() { + DataType::Float32 => { + let values = vectors.values().as_primitive::(); + let values: &[f32] = values.values().as_ref(); + Ok(MatrixBuffer::Borrowed { + values, + rows: vectors.len(), + cols: dim, + }) + } + DataType::Float16 => { + let values = vectors.values().as_primitive::(); + let data = values + .values() + .iter() + .map(|value| value.to_f32()) + .collect::>(); + Ok(MatrixBuffer::Owned( + Array2::from_shape_vec((vectors.len(), dim), data).map_err(|error| { + Error::io(format!("failed to create float16 matrix copy: {error}")) + })?, + )) + } + DataType::Float64 => { + let values = vectors.values().as_primitive::(); + let data = values + .values() + .iter() + .map(|value| *value as f32) + .collect::>(); + Ok(MatrixBuffer::Owned( + Array2::from_shape_vec((vectors.len(), dim), data).map_err(|error| { + Error::io(format!("failed to create float64 matrix copy: {error}")) + })?, + )) + } + other => Err(Error::not_supported(format!( + "cuVS IVF_PQ currently supports float16/float32/float64 vectors, got {other}" + ))), + } +} + +fn ivf_centroids_from_host(array: Array2) -> Result { + let dim = array.ncols() as i32; + let values = Float32Array::from_iter_values(array.into_iter()); + Ok(FixedSizeListArray::try_new_from_values(values, dim)?) 
}

/// Re-layout the cuVS PQ codebook into Lance's on-disk ordering.
///
/// cuVS hands back `[subspace][component][centroid]` (validated against
/// `shape`); Lance stores `[subspace][centroid][component]`, so the two inner
/// axes are swapped while flattening.
fn pq_codebook_from_host(
    values: Vec<f32>,
    shape: [usize; 3],
    num_sub_vectors: usize,
    dimension: usize,
    num_bits: usize,
) -> Result<FixedSizeListArray> {
    let pq_book_size = 1usize << num_bits;
    let subvector_dim = dimension / num_sub_vectors;
    let expected = [num_sub_vectors, subvector_dim, pq_book_size];
    if shape != expected {
        return Err(Error::io(format!(
            "cuVS returned incompatible PQ codebook shape: expected {expected:?}, got {shape:?}"
        )));
    }

    let values = &values;
    let flattened: Vec<f32> = (0..num_sub_vectors)
        .flat_map(|subspace| {
            (0..pq_book_size).flat_map(move |centroid| {
                (0..subvector_dim).map(move |component| {
                    values[(subspace * subvector_dim + component) * pq_book_size + centroid]
                })
            })
        })
        .collect();

    Ok(FixedSizeListArray::try_new_from_values(
        Float32Array::from(flattened),
        subvector_dim as i32,
    )?)
}

/// Wrap the centroids and codebook in single-row list columns so they can be
/// stored together in the artifact metadata file.
fn build_metadata_batch(
    ivf_centroids: &FixedSizeListArray,
    pq_codebook: &FixedSizeListArray,
) -> Result<RecordBatch> {
    // Each column holds exactly one list row spanning the whole array.
    let wrap = |item_name: &str, array: &FixedSizeListArray| {
        let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, array.len() as i32]));
        ListArray::new(
            Arc::new(Field::new(item_name, array.data_type().clone(), false)),
            offsets,
            Arc::new(array.clone()),
            None,
        )
    };
    let ivf_list = wrap("_ivf_centroids_item", ivf_centroids);
    let pq_list = wrap("_pq_codebook_item", pq_codebook);
    let schema = Arc::new(ArrowSchema::new(vec![
        Field::new("_ivf_centroids", ivf_list.data_type().clone(), false),
        Field::new("_pq_codebook", pq_list.data_type().clone(), false),
    ]));
    Ok(RecordBatch::try_new(
        schema,
        vec![Arc::new(ivf_list), Arc::new(pq_list)],
    )?)
}

/// Writer options pinned to the partition-artifact file format version.
fn metadata_writer_options() -> Result<FileWriterOptions> {
    let format_version = PARTITION_ARTIFACT_FILE_VERSION.parse().map_err(|error| {
        Error::invalid_input(format!(
            "invalid partition artifact file version '{}': {}",
            PARTITION_ARTIFACT_FILE_VERSION, error
        ))
    })?;
    Ok(FileWriterOptions {
        format_version: Some(format_version),
        ..Default::default()
    })
}

/// Write the trained centroids/codebook plus the build parameters into the
/// artifact metadata file under `artifact_uri`.
async fn write_partition_artifact_metadata(
    artifact_uri: &str,
    trained: &TrainedIvfPqIndex,
) -> Result<()> {
    let (object_store, root_dir) = lance::io::ObjectStore::from_uri(artifact_uri)
        .await
        .map_err(|error| Error::io(error.to_string()))?;
    let path = root_dir.child(PARTITION_ARTIFACT_METADATA_FILE_NAME);
    let batch = build_metadata_batch(&trained.ivf_centroids, &trained.pq_codebook)?;
    let mut writer = FileWriter::try_new(
        object_store.create(&path).await?,
        lance_core::datatypes::Schema::try_from(batch.schema().as_ref())?,
        metadata_writer_options()?,
    )?;
    // Record the build parameters so a reader can validate compatibility
    // without decoding the payload.
    let metadata = [
        ("lance:index_build:artifact_version", "1".to_string()),
        (
            "lance:index_build:distance_type",
            trained.metric_type.to_string(),
        ),
        (
            "lance:index_build:num_partitions",
            trained.num_partitions.to_string(),
        ),
        (
            "lance:index_build:num_sub_vectors",
            trained.num_sub_vectors.to_string(),
        ),
        ("lance:index_build:num_bits", trained.num_bits.to_string()),
        ("lance:index_build:dimension", trained.dimension.to_string()),
    ];
    for (key, value) in metadata {
        writer.add_schema_metadata(key.to_string(), value);
    }
    writer.write_batch(&batch).await?;
    writer.finish().await?;
    Ok(())
}

/// Assemble one `(row id, partition id, PQ code)` batch for the artifact.
fn build_partition_batch(
    row_ids: Arc<dyn Array>,
    partitions: &[u32],
    pq_codes: &[u8],
    code_width: usize,
) -> Result<RecordBatch> {
    if pq_codes.len() != partitions.len() * code_width {
        return Err(Error::io(format!(
            "partition artifact batch expects {} PQ codes for {} rows and code width {}, got {}",
+ partitions.len() * code_width, + partitions.len(), + code_width, + pq_codes.len() + ))); + } + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new(ROW_ID, DataType::UInt64, false), + Field::new(PART_ID_COLUMN, DataType::UInt32, false), + Field::new( + PQ_CODE_COLUMN, + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::UInt8, true)), + code_width as i32, + ), + true, + ), + ])); + let pq_codes = FixedSizeListArray::try_new_from_values( + UInt8Array::from_iter_values(pq_codes.iter().copied()), + code_width as i32, + )?; + Ok(RecordBatch::try_new( + schema, + vec![ + row_ids, + Arc::new(UInt32Array::from_iter_values(partitions.iter().copied())), + Arc::new(pq_codes), + ], + )?) +} + +fn transform_batch_loss(batch: &RecordBatch) -> f64 { + batch + .metadata() + .get(LOSS_METADATA_KEY) + .and_then(|value| value.parse::().ok()) + .unwrap_or(0.0) +} + +struct TransformSlot { + input_host: PinnedHostBuffer, + input_device: DeviceTensor, + labels_host: PinnedHostBuffer, + labels_device: DeviceTensor, + codes_host: PinnedHostBuffer, + codes_device: DeviceTensor, + h2d_start: CudaEvent, + h2d_done: CudaEvent, + transform_done: CudaEvent, + output_ready: CudaEvent, + row_ids: Option>, + rows: usize, +} + +impl TransformSlot { + fn try_new( + resources: &Resources, + max_rows: usize, + dimension: usize, + code_width: usize, + ) -> Result { + Ok(Self { + input_host: PinnedHostBuffer::try_new(max_rows * dimension)?, + input_device: DeviceTensor::try_new(resources, &[max_rows, dimension])?, + labels_host: PinnedHostBuffer::try_new(max_rows)?, + labels_device: DeviceTensor::try_new(resources, &[max_rows])?, + codes_host: PinnedHostBuffer::try_new(max_rows * code_width)?, + codes_device: DeviceTensor::try_new(resources, &[max_rows, code_width])?, + h2d_start: CudaEvent::try_new()?, + h2d_done: CudaEvent::try_new()?, + transform_done: CudaEvent::try_new()?, + output_ready: CudaEvent::try_new()?, + row_ids: None, + rows: 0, + }) + } + + fn 
has_pending_output(&self) -> bool { + self.row_ids.is_some() + } + + fn launch( + &mut self, + trained: &TrainedIvfPqIndex, + stream: cuvs_sys::cudaStream_t, + row_ids: Arc, + matrix: &[f32], + rows: usize, + dimension: usize, + ) -> Result<()> { + let code_width = trained.pq_code_width(); + self.input_host.copy_from_slice(matrix)?; + self.input_device.set_shape(&[rows, dimension])?; + self.labels_device.set_shape(&[rows])?; + self.codes_device.set_shape(&[rows, code_width])?; + self.rows = rows; + self.row_ids = Some(row_ids); + + self.h2d_start.record(stream)?; + self.input_device.copy_from_host_async( + &trained.resources, + self.input_host.prefix(rows * dimension)?, + )?; + self.h2d_done.record(stream)?; + check_cuvs( + unsafe { + cuvs_sys::cuvsIvfPqTransform( + trained.resources.0, + trained.index.raw, + self.input_device.as_mut_ptr(), + self.labels_device.as_mut_ptr(), + self.codes_device.as_mut_ptr(), + ) + }, + "transform vectors with IVF_PQ", + )?; + self.transform_done.record(stream)?; + self.labels_device + .copy_to_host_async(&trained.resources, self.labels_host.prefix_mut(rows)?)?; + self.codes_device.copy_to_host_async( + &trained.resources, + self.codes_host.prefix_mut(rows * code_width)?, + )?; + self.output_ready.record(stream)?; + Ok(()) + } + + fn drain_to_batch(&mut self, code_width: usize) -> Result> { + if !self.has_pending_output() { + return Ok(None); + } + + self.output_ready.synchronize()?; + let row_ids = self + .row_ids + .take() + .ok_or_else(|| Error::io("transform slot is missing row ids"))?; + let batch = build_partition_batch( + row_ids, + self.labels_host.prefix(self.rows)?, + self.codes_host.prefix(self.rows * code_width)?, + code_width, + )?; + self.rows = 0; + Ok(Some(batch)) + } +} + +async fn for_each_transformed_batch( + dataset: &Dataset, + column: &str, + trained: &TrainedIvfPqIndex, + batch_size: usize, + filter_nan: bool, + mut on_batch: F, +) -> Result<()> +where + F: FnMut(RecordBatch) -> Fut, + Fut: 
std::future::Future>, +{ + let code_width = trained.pq_code_width(); + let mut scanner = dataset.scan(); + scanner.project(&[column])?; + if dataset + .schema() + .field(column) + .is_some_and(|field| field.nullable && filter_nan) + { + scanner.filter(&format!("{column} is not null"))?; + } + scanner.with_row_id(); + scanner.batch_size(batch_size); + let mut stream = scanner.try_into_stream().await?; + let cuda_stream = trained + .resources + .get_cuda_stream() + .map_err(|error| Error::io(error.to_string()))?; + let mut slots = (0..PIPELINE_SLOTS) + .map(|_| { + TransformSlot::try_new( + &trained.resources, + batch_size, + trained.dimension, + code_width, + ) + }) + .collect::>>()?; + let mut next_slot = 0usize; + + loop { + let Some(batch) = stream.try_next().await? else { + break; + }; + let slot = &mut slots[next_slot]; + if let Some(transformed) = slot.drain_to_batch(code_width)? { + on_batch(transformed).await?; + } + + let vectors = vector_column_to_fsl(&batch, column)?; + let row_ids = batch + .column_by_name(ROW_ID) + .ok_or_else(|| Error::invalid_input(format!("transform batch is missing {ROW_ID}")))?; + let finite_mask = is_finite(&vectors); + let valid_rows = finite_mask.true_count(); + if valid_rows == 0 { + continue; + } + if valid_rows != vectors.len() { + warn!( + "{} vectors are ignored during partition assignment because they are null or non-finite", + vectors.len() - valid_rows + ); + } + + let filtered_row_ids = if valid_rows == row_ids.len() { + row_ids.clone() + } else { + filter(row_ids.as_ref(), &finite_mask)? + }; + let filtered_vectors = if valid_rows == vectors.len() { + vectors + } else { + let vector_column = batch.column_by_name(column).ok_or_else(|| { + Error::invalid_input(format!( + "transform batch is missing vector column '{column}'" + )) + })?; + let field = batch + .schema() + .field_with_name(column) + .map_err(|_| { + Error::invalid_input(format!( + "transform batch schema is missing field '{column}'" + )) + })? 
+ .clone(); + let filtered_vectors = filter(vector_column.as_ref(), &finite_mask)?; + vector_column_to_fsl( + &RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![field])), + vec![filtered_vectors], + )?, + column, + )? + }; + + let matrix = matrix_from_vectors(&filtered_vectors)?; + let matrix_view = matrix.view()?; + let input_slice = matrix_view + .as_slice_memory_order() + .ok_or_else(|| Error::io("transform matrix is not contiguous"))?; + + slot.launch( + trained, + cuda_stream, + filtered_row_ids, + input_slice, + matrix.rows(), + matrix_view.ncols(), + )?; + next_slot = (next_slot + 1) % PIPELINE_SLOTS; + } + + for slot in &mut slots { + if let Some(transformed) = slot.drain_to_batch(code_width)? { + on_batch(transformed).await?; + } + } + Ok(()) +} + +pub async fn train_ivf_pq( + dataset: &Dataset, + column: &str, + num_partitions: usize, + metric_type: DistanceType, + num_sub_vectors: usize, + sample_rate: usize, + max_iters: usize, + num_bits: usize, + filter_nan: bool, +) -> Result { + if num_bits != 8 { + return Err(Error::not_supported( + "cuVS IVF_PQ currently supports only num_bits=8", + )); + } + + let dimension = infer_dimension(dataset, column)?; + if dimension % num_sub_vectors != 0 { + return Err(Error::invalid_input(format!( + "cuVS IVF_PQ requires vector dimension {} to be divisible by num_sub_vectors {}", + dimension, num_sub_vectors + ))); + } + + let num_rows = dataset.count_rows(None).await?; + if num_rows == 0 { + return Err(Error::invalid_input( + "cuVS training requires at least one training vector", + )); + } + let train_rows = num_rows + .min((num_partitions * sample_rate).max(256 * 256)) + .max(1); + let train_vectors = if filter_nan { + let batch = dataset.scan().project(&[column])?.try_into_batch().await?; + let vectors = vector_column_to_fsl(&batch, column)?; + let mask = is_finite(&vectors); + let filtered = filter(&vectors, &mask)?.as_fixed_size_list().clone(); + filtered.slice(0, train_rows.min(filtered.len())) + } else { + 
let projection = dataset.schema().project(&[column])?; + let batch = dataset.sample(train_rows, &projection, None).await?; + vector_column_to_fsl(&batch, column)? + }; + if train_vectors.is_empty() { + return Err(Error::invalid_input( + "cuVS training requires at least one non-null training vector", + )); + } + + let matrix = matrix_from_vectors(&train_vectors)?; + let resources = Resources::new().map_err(|error| Error::io(error.to_string()))?; + let index = CuvsIvfPqIndex::try_new()?; + let params = create_index_params( + metric_type, + num_partitions, + num_sub_vectors, + sample_rate, + max_iters, + num_bits, + )?; + let matrix_view = matrix.view()?; + let mut dataset_tensor = HostTensorView::try_new::( + &[matrix_view.nrows(), matrix_view.ncols()], + matrix_view.as_ptr() as *mut std::ffi::c_void, + ); + + let build_result = check_cuvs( + unsafe { + cuvs_sys::cuvsIvfPqBuild(resources.0, params, dataset_tensor.as_mut_ptr(), index.raw) + }, + "build IVF_PQ index", + ); + destroy_index_params(params); + build_result?; + + let mut centers = make_tensor_view(); + check_cuvs( + unsafe { cuvs_sys::cuvsIvfPqIndexGetCenters(index.raw, centers.as_mut_ptr()) }, + "get IVF centroids", + )?; + let ivf_centroids = + ivf_centroids_from_host(copy_tensor_to_host_f32_2d(&resources, ¢ers.tensor)?)?; + + let mut pq_centers = make_tensor_view(); + check_cuvs( + unsafe { cuvs_sys::cuvsIvfPqIndexGetPqCenters(index.raw, pq_centers.as_mut_ptr()) }, + "get PQ codebook", + )?; + let (pq_codebook_values, pq_codebook_shape) = + copy_tensor_to_host_f32_3d(&resources, &pq_centers.tensor)?; + let pq_codebook = pq_codebook_from_host( + pq_codebook_values, + pq_codebook_shape, + num_sub_vectors, + dimension, + num_bits, + )?; + + Ok(TrainedIvfPqIndex { + resources, + index, + num_partitions, + dimension, + num_sub_vectors, + num_bits, + metric_type, + ivf_centroids, + pq_codebook, + }) +} + +pub async fn assign_ivf_pq_to_artifact( + dataset: &Dataset, + column: &str, + trained: 
&TrainedIvfPqIndex, + artifact_uri: &str, + batch_size: usize, + filter_nan: bool, +) -> Result> { + let code_width = trained.pq_code_width(); + let builder = Arc::new(tokio::sync::Mutex::new( + PartitionArtifactBuilder::try_new(artifact_uri, trained.num_partitions, code_width, None) + .await?, + )); + for_each_transformed_batch(dataset, column, trained, batch_size, filter_nan, |batch| { + let builder = builder.clone(); + async move { + builder.lock().await.append_batch(&batch).await?; + Ok(()) + } + }) + .await?; + let mut builder = Arc::try_unwrap(builder) + .map_err(|_| Error::io("partition artifact builder still has outstanding references"))? + .into_inner(); + + write_partition_artifact_metadata(artifact_uri, trained).await?; + let mut files = builder.finish(PARTITION_ARTIFACT_METADATA_FILE_NAME, None).await?; + if files.len() > 1 { + files.insert(1, PARTITION_ARTIFACT_METADATA_FILE_NAME.to_string()); + } + Ok(files) +} diff --git a/rust/lance-index/src/vector/ivf/builder.rs b/rust/lance-index/src/vector/ivf/builder.rs index 9154adbcd80..155b33f58b9 100644 --- a/rust/lance-index/src/vector/ivf/builder.rs +++ b/rust/lance-index/src/vector/ivf/builder.rs @@ -53,6 +53,10 @@ pub struct IvfBuildParams { /// Requires `centroids` to be set. pub precomputed_encoded_dataset_uri: Option, + /// Precomputed partitioned artifact produced by an external backend. + /// Mutually exclusive with other precomputed inputs and requires `centroids` to be set. 
+ pub precomputed_partition_artifact_uri: Option, + pub shuffle_partition_batches: usize, pub shuffle_partition_concurrency: usize, @@ -73,6 +77,7 @@ impl Default for IvfBuildParams { precomputed_partitions_file: None, precomputed_shuffle_buffers: None, precomputed_encoded_dataset_uri: None, + precomputed_partition_artifact_uri: None, shuffle_partition_batches: 1024 * 10, shuffle_partition_concurrency: 2, storage_options: None, diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index 13176c3bca8..48235a65582 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -10,6 +10,7 @@ use std::{any::Any, collections::HashMap}; pub mod builder; mod encoded_dataset; pub mod ivf; +mod partition_artifact; pub mod pq; pub mod utils; @@ -32,6 +33,7 @@ use lance_index::vector::hnsw::HNSW; use lance_index::vector::ivf::builder::recommended_num_partitions; use lance_index::vector::ivf::storage::IvfModel; use object_store::path::Path; +pub use partition_artifact::PartitionArtifactBuilder; use lance_arrow::FixedSizeListArrayExt; use lance_index::vector::pq::ProductQuantizer; @@ -1657,6 +1659,7 @@ fn derive_ivf_params(ivf_model: &IvfModel) -> IvfBuildParams { precomputed_partitions_file: None, precomputed_shuffle_buffers: None, precomputed_encoded_dataset_uri: None, + precomputed_partition_artifact_uri: None, shuffle_partition_batches: 1024 * 10, // Default shuffle_partition_concurrency: 2, // Default storage_options: None, diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 258e978f1ac..9c9d4b16eed 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -94,6 +94,7 @@ use super::v2::IVFIndex; use super::{ encoded_dataset::EncodedDatasetShuffleReader, ivf::load_precomputed_partitions_if_available, + partition_artifact::PartitionArtifactShuffleReader, utils::{self, get_vector_type}, }; @@ -240,6 +241,19 @@ impl IvfIndexBuilder )) } + async fn 
try_open_precomputed_partition_artifact_reader( + &self, + uri: &str, + ) -> Result> { + let storage_options = self + .ivf_params + .as_ref() + .and_then(|params| params.storage_options.as_ref()); + Ok(Arc::new( + PartitionArtifactShuffleReader::try_open(uri, storage_options).await?, + )) + } + #[allow(clippy::too_many_arguments)] pub fn new( dataset: Dataset, @@ -622,6 +636,19 @@ impl IvfIndexBuilder return Err(Error::invalid_input("dataset not set before shuffling")); }; + if let Some(uri) = self + .ivf_params + .as_ref() + .and_then(|params| params.precomputed_partition_artifact_uri.as_deref()) + { + log::info!("shuffle with precomputed partition artifact from {}", uri); + self.shuffle_reader = Some( + self.try_open_precomputed_partition_artifact_reader(uri) + .await?, + ); + return Ok(()); + } + if let Some(uri) = self .ivf_params .as_ref() diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 1f2b47887db..4841c98d661 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -1210,6 +1210,12 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } + if ivf.precomputed_partition_artifact_uri.is_some() && ivf.centroids.is_none() { + return Err(Error::index( + "precomputed_partition_artifact_uri requires centroids to be set".to_string(), + )); + } + if ivf.precomputed_shuffle_buffers.is_some() && ivf.precomputed_partitions_file.is_some() { return Err(Error::index( "precomputed_shuffle_buffers and precomputed_partitions_file are mutually exclusive" @@ -1231,6 +1237,31 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } + if ivf.precomputed_partition_artifact_uri.is_some() && ivf.precomputed_partitions_file.is_some() + { + return Err(Error::index( + "precomputed_partition_artifact_uri and precomputed_partitions_file are mutually exclusive" + .to_string(), + )); + } + + if ivf.precomputed_partition_artifact_uri.is_some() && 
ivf.precomputed_shuffle_buffers.is_some() + { + return Err(Error::index( + "precomputed_partition_artifact_uri and precomputed_shuffle_buffers are mutually exclusive" + .to_string(), + )); + } + + if ivf.precomputed_partition_artifact_uri.is_some() + && ivf.precomputed_encoded_dataset_uri.is_some() + { + return Err(Error::index( + "precomputed_partition_artifact_uri and precomputed_encoded_dataset_uri are mutually exclusive" + .to_string(), + )); + } + Ok(()) } @@ -1248,6 +1279,12 @@ fn sanity_check_params(ivf: &IvfBuildParams, pq: &PQBuildParams) -> Result<()> { )); } + if ivf.precomputed_partition_artifact_uri.is_some() && pq.codebook.is_none() { + return Err(Error::index( + "precomputed_partition_artifact_uri requires codebooks to be set".to_string(), + )); + } + Ok(()) } diff --git a/rust/lance/src/index/vector/partition_artifact.rs b/rust/lance/src/index/vector/partition_artifact.rs new file mode 100644 index 00000000000..cfd2a0f2b4a --- /dev/null +++ b/rust/lance/src/index/vector/partition_artifact.rs @@ -0,0 +1,956 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::collections::HashMap; +use std::mem; +use std::ops::Range; +use std::sync::{Arc, Mutex}; + +use arrow_array::cast::AsArray; +use arrow_array::{FixedSizeListArray, RecordBatch, UInt8Array, UInt32Array, UInt64Array}; +use arrow_schema::{DataType, Field, Schema as ArrowSchema}; +use futures::TryStreamExt; +use lance_arrow::FixedSizeListArrayExt; +use lance_core::cache::LanceCache; +use lance_core::datatypes::Schema; +use lance_core::{Error, ROW_ID, Result}; +use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; +use lance_file::reader::{FileReader, FileReaderOptions}; +use lance_file::version::LanceFileVersion; +use lance_file::writer::{FileWriter, FileWriterOptions}; +use lance_index::vector::v3::shuffler::ShuffleReader; +use lance_index::vector::{PART_ID_COLUMN, PQ_CODE_COLUMN}; +use lance_io::ReadBatchParams; +use 
lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; +use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; +use lance_io::stream::{RecordBatchStream, RecordBatchStreamAdapter}; +use lance_io::traits::Writer; +use lance_io::utils::CachedFileSize; +use object_store::path::Path; +use serde::{Deserialize, Serialize}; +use tokio::io::AsyncWriteExt; + +const PARTITION_ARTIFACT_MANIFEST_VERSION: u32 = 1; +const PARTITION_ARTIFACT_MANIFEST_FILE_NAME: &str = "manifest.json"; +const PARTITION_ARTIFACT_PARTITIONS_DIR: &str = "partitions"; +const PARTITION_ARTIFACT_DEFAULT_BUCKETS: usize = 256; +const PARTITION_ARTIFACT_STAGING_PREFIX: &str = ".staging-bucket-"; +const PARTITION_ARTIFACT_BUCKET_PREFIX: &str = "bucket-"; +const PARTITION_ARTIFACT_FILE_VERSION: &str = "2.2"; +const PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS: usize = 32 * 1024; + +#[derive(Debug, Serialize, Deserialize)] +struct PartitionArtifactManifest { + version: u32, + num_partitions: usize, + #[serde(default)] + metadata_file: Option, + #[serde(default)] + total_loss: Option, + partitions: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct PartitionArtifactPartition { + #[serde(default)] + path: Option, + #[serde(default)] + num_rows: usize, + #[serde(default)] + ranges: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct PartitionArtifactRange { + offset: u64, + num_rows: u64, +} + +#[derive(Default, Debug)] +struct BucketBuffer { + row_ids: Vec, + partition_ids: Vec, + pq_values: Vec, +} + +impl BucketBuffer { + fn len(&self) -> usize { + self.row_ids.len() + } + + fn is_empty(&self) -> bool { + self.row_ids.is_empty() + } +} + +pub struct PartitionArtifactBuilder { + object_store: Arc, + root_dir: Path, + num_partitions: usize, + num_buckets: usize, + pq_code_width: usize, + temp_schema: Arc, + final_schema: Arc, + temp_writers: Vec>, + buffers: Vec, +} + +impl PartitionArtifactBuilder { + pub async fn try_new( + uri: &str, + num_partitions: 
usize, + pq_code_width: usize, + storage_options: Option<&HashMap>, + ) -> Result { + let registry = Arc::new(ObjectStoreRegistry::default()); + let params = if let Some(storage_options) = storage_options { + ObjectStoreParams { + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + storage_options.clone(), + ), + )), + ..Default::default() + } + } else { + ObjectStoreParams::default() + }; + let (object_store, root_dir) = + ObjectStore::from_uri_and_params(registry, uri, ¶ms).await?; + Self::try_new_with_store(object_store, root_dir, num_partitions, pq_code_width) + } + + pub fn try_new_with_store( + object_store: Arc, + root_dir: Path, + num_partitions: usize, + pq_code_width: usize, + ) -> Result { + if num_partitions == 0 { + return Err(Error::invalid_input( + "partition artifact builder requires num_partitions > 0".to_string(), + )); + } + if pq_code_width == 0 { + return Err(Error::invalid_input( + "partition artifact builder requires pq_code_width > 0".to_string(), + )); + } + + let num_buckets = num_partitions + .min(PARTITION_ARTIFACT_DEFAULT_BUCKETS) + .max(1); + let temp_schema = Arc::new(ArrowSchema::new(vec![ + Field::new(ROW_ID, DataType::UInt64, false), + Field::new(PART_ID_COLUMN, DataType::UInt32, false), + Field::new( + PQ_CODE_COLUMN, + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::UInt8, true)), + pq_code_width as i32, + ), + true, + ), + ])); + let final_schema = Arc::new(ArrowSchema::new(vec![ + Field::new(ROW_ID, DataType::UInt64, false), + Field::new( + PQ_CODE_COLUMN, + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::UInt8, true)), + pq_code_width as i32, + ), + true, + ), + ])); + + Ok(Self { + object_store, + root_dir, + num_partitions, + num_buckets, + pq_code_width, + temp_schema, + final_schema, + temp_writers: (0..num_buckets).map(|_| None).collect(), + buffers: (0..num_buckets).map(|_| BucketBuffer::default()).collect(), + }) + } + + pub 
async fn append_batch(&mut self, batch: &RecordBatch) -> Result<()> { + validate_input_batch(batch, self.pq_code_width)?; + + let row_ids = batch[ROW_ID].as_primitive::(); + let part_ids = batch[PART_ID_COLUMN].as_primitive::(); + let pq_codes = batch[PQ_CODE_COLUMN].as_fixed_size_list(); + let pq_values = pq_codes + .values() + .as_primitive::(); + let pq_values = pq_values.values().as_ref(); + + for row_idx in 0..batch.num_rows() { + let partition_id = part_ids.value(row_idx) as usize; + if partition_id >= self.num_partitions { + return Err(Error::invalid_input(format!( + "partition artifact batch contains partition id {} but num_partitions is {}", + partition_id, self.num_partitions + ))); + } + let bucket_id = partition_id % self.num_buckets; + let buffer = &mut self.buffers[bucket_id]; + buffer.row_ids.push(row_ids.value(row_idx)); + buffer.partition_ids.push(partition_id as u32); + let start = row_idx * self.pq_code_width; + let end = start + self.pq_code_width; + buffer.pq_values.extend_from_slice(&pq_values[start..end]); + if buffer.len() >= PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS { + self.flush_bucket(bucket_id).await?; + } + } + Ok(()) + } + + pub async fn finish( + &mut self, + metadata_file: &str, + total_loss: Option, + ) -> Result> { + for bucket_id in 0..self.num_buckets { + self.flush_bucket(bucket_id).await?; + } + for writer in self.temp_writers.iter_mut() { + if let Some(writer) = writer.as_mut() { + writer.finish().await?; + } + } + + let mut partitions = vec![ + PartitionArtifactPartition { + path: None, + num_rows: 0, + ranges: Vec::new(), + }; + self.num_partitions + ]; + let mut artifact_files = Vec::with_capacity(self.num_buckets + 1); + + for bucket_id in 0..self.num_buckets { + if let Some(relative_path) = self.finalize_bucket(bucket_id, &mut partitions).await? 
{ + artifact_files.push(relative_path); + } + } + + let manifest = PartitionArtifactManifest { + version: PARTITION_ARTIFACT_MANIFEST_VERSION, + num_partitions: self.num_partitions, + metadata_file: Some(metadata_file.to_string()), + total_loss, + partitions, + }; + write_json( + self.object_store.as_ref(), + &self.root_dir.child(PARTITION_ARTIFACT_MANIFEST_FILE_NAME), + &manifest, + ) + .await?; + + let mut files = vec![PARTITION_ARTIFACT_MANIFEST_FILE_NAME.to_string()]; + files.extend(artifact_files); + Ok(files) + } + + async fn flush_bucket(&mut self, bucket_id: usize) -> Result<()> { + if self.buffers[bucket_id].is_empty() { + return Ok(()); + } + + let batch = self.take_temp_batch(bucket_id)?; + let writer = self.ensure_temp_writer(bucket_id).await?; + writer.write_batch(&batch).await?; + Ok(()) + } + + fn take_temp_batch(&mut self, bucket_id: usize) -> Result { + let buffer = &mut self.buffers[bucket_id]; + let row_ids = UInt64Array::from(mem::take(&mut buffer.row_ids)); + let part_ids = UInt32Array::from(mem::take(&mut buffer.partition_ids)); + let pq_values = UInt8Array::from(mem::take(&mut buffer.pq_values)); + let pq_codes = + FixedSizeListArray::try_new_from_values(pq_values, self.pq_code_width as i32)?; + RecordBatch::try_new( + self.temp_schema.clone(), + vec![Arc::new(row_ids), Arc::new(part_ids), Arc::new(pq_codes)], + ) + .map_err(Error::from) + } + + async fn ensure_temp_writer(&mut self, bucket_id: usize) -> Result<&mut FileWriter> { + if self.temp_writers[bucket_id].is_none() { + let path = self.temp_bucket_path(bucket_id); + let writer = FileWriter::try_new( + self.object_store.create(&path).await?, + Schema::try_from(self.temp_schema.as_ref())?, + file_writer_options()?, + )?; + self.temp_writers[bucket_id] = Some(writer); + } + Ok(self.temp_writers[bucket_id] + .as_mut() + .expect("temp writer initialized")) + } + + async fn finalize_bucket( + &self, + bucket_id: usize, + partitions: &mut [PartitionArtifactPartition], + ) -> Result> { + let 
temp_path = self.temp_bucket_path(bucket_id); + if !self.object_store.exists(&temp_path).await? { + return Ok(None); + } + + let reader = FileReader::try_open( + ScanScheduler::new( + self.object_store.clone(), + SchedulerConfig::max_bandwidth(&self.object_store), + ) + .open_file(&temp_path, &CachedFileSize::unknown()) + .await?, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await?; + + let batches = reader + .read_stream( + ReadBatchParams::RangeFull, + u32::MAX, + 16, + FilterExpression::no_filter(), + )? + .try_collect::>() + .await?; + let total_rows = batches.iter().map(|batch| batch.num_rows()).sum::(); + if total_rows == 0 { + self.object_store.delete(&temp_path).await?; + return Ok(None); + } + + let mut row_ids = Vec::with_capacity(total_rows); + let mut partition_ids = Vec::with_capacity(total_rows); + let mut pq_values = Vec::with_capacity(total_rows * self.pq_code_width); + for batch in batches { + let batch_row_ids = batch[ROW_ID].as_primitive::(); + let batch_partition_ids = + batch[PART_ID_COLUMN].as_primitive::(); + let batch_pq = batch[PQ_CODE_COLUMN].as_fixed_size_list(); + let batch_pq_values = batch_pq + .values() + .as_primitive::(); + row_ids.extend(batch_row_ids.values().iter().copied()); + partition_ids.extend(batch_partition_ids.values().iter().copied()); + pq_values.extend_from_slice(batch_pq_values.values().as_ref()); + } + + let mut permutation = (0..total_rows).collect::>(); + permutation.sort_unstable_by_key(|&idx| partition_ids[idx]); + + let mut sorted_row_ids = Vec::with_capacity(total_rows); + let mut sorted_partition_ids = Vec::with_capacity(total_rows); + let mut sorted_pq_values = Vec::with_capacity(total_rows * self.pq_code_width); + for idx in permutation { + sorted_row_ids.push(row_ids[idx]); + sorted_partition_ids.push(partition_ids[idx]); + let start = idx * self.pq_code_width; + let end = start + self.pq_code_width; + 
sorted_pq_values.extend_from_slice(&pq_values[start..end]); + } + + let final_path = self.final_bucket_path(bucket_id); + let final_relative_path = self.final_bucket_relative_path(bucket_id); + let mut writer = FileWriter::try_new( + self.object_store.create(&final_path).await?, + Schema::try_from(self.final_schema.as_ref())?, + file_writer_options()?, + )?; + let final_batch = RecordBatch::try_new( + self.final_schema.clone(), + vec![ + Arc::new(UInt64Array::from(sorted_row_ids)), + Arc::new(FixedSizeListArray::try_new_from_values( + UInt8Array::from(sorted_pq_values), + self.pq_code_width as i32, + )?), + ], + )?; + writer.write_batch(&final_batch).await?; + writer.finish().await?; + + let mut offset = 0usize; + while offset < sorted_partition_ids.len() { + let partition_id = sorted_partition_ids[offset] as usize; + let mut end = offset + 1; + while end < sorted_partition_ids.len() + && sorted_partition_ids[end] == sorted_partition_ids[offset] + { + end += 1; + } + partitions[partition_id] = PartitionArtifactPartition { + path: Some(final_relative_path.clone()), + num_rows: end - offset, + ranges: vec![PartitionArtifactRange { + offset: offset as u64, + num_rows: (end - offset) as u64, + }], + }; + offset = end; + } + + self.object_store.delete(&temp_path).await?; + Ok(Some(final_relative_path)) + } + + fn temp_bucket_path(&self, bucket_id: usize) -> Path { + self.root_dir + .child(PARTITION_ARTIFACT_PARTITIONS_DIR) + .child(format!( + "{PARTITION_ARTIFACT_STAGING_PREFIX}{bucket_id:05}.lance" + )) + } + + fn final_bucket_path(&self, bucket_id: usize) -> Path { + self.root_dir + .child(PARTITION_ARTIFACT_PARTITIONS_DIR) + .child(format!( + "{PARTITION_ARTIFACT_BUCKET_PREFIX}{bucket_id:05}.lance" + )) + } + + fn final_bucket_relative_path(&self, bucket_id: usize) -> String { + format!( + "{PARTITION_ARTIFACT_PARTITIONS_DIR}/{PARTITION_ARTIFACT_BUCKET_PREFIX}{bucket_id:05}.lance" + ) + } +} + +#[derive(Debug)] +pub(crate) struct PartitionArtifactShuffleReader { + 
scheduler: Arc, + root_dir: Path, + partitions: Vec, + total_loss: Option, + file_readers: Mutex>>, +} + +fn file_writer_options() -> Result { + Ok(FileWriterOptions { + format_version: Some( + PARTITION_ARTIFACT_FILE_VERSION + .parse::() + .map_err(|error| { + Error::invalid_input(format!( + "invalid partition artifact file version '{}': {}", + PARTITION_ARTIFACT_FILE_VERSION, error + )) + })?, + ), + ..Default::default() + }) +} + +fn validate_input_batch(batch: &RecordBatch, pq_code_width: usize) -> Result<()> { + let Some(row_ids) = batch.column_by_name(ROW_ID) else { + return Err(Error::invalid_input(format!( + "partition artifact batch must contain {ROW_ID}" + ))); + }; + if row_ids.data_type() != &DataType::UInt64 { + return Err(Error::invalid_input(format!( + "partition artifact batch column {ROW_ID} must be uint64, got {}", + row_ids.data_type() + ))); + } + let Some(part_ids) = batch.column_by_name(PART_ID_COLUMN) else { + return Err(Error::invalid_input(format!( + "partition artifact batch must contain {PART_ID_COLUMN}" + ))); + }; + if part_ids.data_type() != &DataType::UInt32 { + return Err(Error::invalid_input(format!( + "partition artifact batch column {PART_ID_COLUMN} must be uint32, got {}", + part_ids.data_type() + ))); + } + let Some(pq_codes) = batch.column_by_name(PQ_CODE_COLUMN) else { + return Err(Error::invalid_input(format!( + "partition artifact batch must contain {PQ_CODE_COLUMN}" + ))); + }; + match pq_codes.data_type() { + DataType::FixedSizeList(_, width) if *width as usize == pq_code_width => Ok(()), + other => Err(Error::invalid_input(format!( + "partition artifact batch column {PQ_CODE_COLUMN} must be fixed_size_list[{}], got {}", + pq_code_width, other + ))), + } +} + +async fn write_json( + object_store: &ObjectStore, + path: &Path, + value: &T, +) -> Result<()> { + let bytes = serde_json::to_vec(value).map_err(|error| { + Error::invalid_input(format!( + "failed to serialize partition artifact manifest '{}': {}", + path, error + 
)) + })?; + let mut writer = object_store.create(path).await?; + writer.write_all(&bytes).await?; + Writer::shutdown(writer.as_mut()).await?; + Ok(()) +} + +impl PartitionArtifactShuffleReader { + pub(crate) async fn try_open( + uri: &str, + storage_options: Option<&HashMap>, + ) -> Result { + let registry = Arc::new(ObjectStoreRegistry::default()); + let params = if let Some(storage_options) = storage_options { + ObjectStoreParams { + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + storage_options.clone(), + ), + )), + ..Default::default() + } + } else { + ObjectStoreParams::default() + }; + let (object_store, root_dir) = + ObjectStore::from_uri_and_params(registry, uri, ¶ms).await?; + Self::try_open_with_store(object_store, root_dir).await + } + + async fn try_open_with_store(object_store: Arc, root_dir: Path) -> Result { + let manifest_path = root_dir.child("manifest.json"); + let manifest_bytes = object_store.read_one_all(&manifest_path).await?; + let manifest: PartitionArtifactManifest = + serde_json::from_slice(&manifest_bytes).map_err(|error| { + Error::invalid_input(format!( + "failed to parse partition artifact manifest '{}': {}", + manifest_path, error + )) + })?; + if manifest.version != 1 { + return Err(Error::invalid_input(format!( + "unsupported partition artifact manifest version {}", + manifest.version + ))); + } + if manifest.partitions.len() != manifest.num_partitions { + return Err(Error::invalid_input(format!( + "partition artifact manifest has {} partitions but num_partitions is {}", + manifest.partitions.len(), + manifest.num_partitions + ))); + } + + let scheduler = ScanScheduler::new( + object_store.clone(), + SchedulerConfig::max_bandwidth(&object_store), + ); + Ok(Self { + scheduler, + root_dir, + partitions: manifest.partitions, + total_loss: manifest.total_loss, + file_readers: Mutex::new(HashMap::new()), + }) + } + + async fn open_file_reader(&self, relative_path: &str) 
-> Result> { + if let Some(reader) = self + .file_readers + .lock() + .expect("partition artifact reader mutex poisoned") + .get(relative_path) + .cloned() + { + return Ok(reader); + } + + let path = join_relative_path(&self.root_dir, relative_path); + let reader = Arc::new( + FileReader::try_open( + self.scheduler + .open_file(&path, &CachedFileSize::unknown()) + .await?, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await?, + ); + self.file_readers + .lock() + .expect("partition artifact reader mutex poisoned") + .insert(relative_path.to_string(), reader.clone()); + Ok(reader) + } +} + +fn join_relative_path(root_dir: &Path, relative_path: &str) -> Path { + relative_path + .split('/') + .filter(|segment| !segment.is_empty()) + .fold(root_dir.clone(), |path, segment| path.child(segment)) +} + +#[async_trait::async_trait] +impl ShuffleReader for PartitionArtifactShuffleReader { + async fn read_partition( + &self, + partition_id: usize, + ) -> Result>> { + let Some(partition) = self.partitions.get(partition_id) else { + return Ok(None); + }; + if partition.num_rows == 0 { + return Ok(None); + } + let path = partition.path.as_ref().ok_or_else(|| { + Error::invalid_input(format!( + "partition artifact partition {} has {} rows but no path", + partition_id, partition.num_rows + )) + })?; + if partition.ranges.is_empty() { + return Err(Error::invalid_input(format!( + "partition artifact partition {} has {} rows but no ranges", + partition_id, partition.num_rows + ))); + } + + let reader = self.open_file_reader(path).await?; + let ranges = partition + .ranges + .iter() + .map(|range| Range { + start: range.offset, + end: range.offset + range.num_rows, + }) + .collect::>(); + let schema = Arc::new(reader.schema().as_ref().into()); + Ok(Some(Box::new(RecordBatchStreamAdapter::new( + schema, + reader.read_stream( + ReadBatchParams::Ranges(ranges.into()), + u32::MAX, + 16, + FilterExpression::no_filter(), + )?, + )))) + } + + fn 
partition_size(&self, partition_id: usize) -> Result { + Ok(self + .partitions + .get(partition_id) + .map(|partition| partition.num_rows) + .unwrap_or(0)) + } + + fn total_loss(&self) -> Option { + self.total_loss + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use arrow_array::cast::AsArray; + use arrow_array::{FixedSizeListArray, RecordBatch, UInt8Array, UInt64Array}; + use futures::TryStreamExt; + use lance_arrow::FixedSizeListArrayExt; + use lance_core::ROW_ID; + use lance_core::datatypes::Schema; + use lance_file::writer::{FileWriter, FileWriterOptions}; + use lance_io::object_store::ObjectStore; + + use crate::Error; + + use super::*; + + #[tokio::test] + async fn partition_artifact_builder_compacts_runs_into_single_partition_range() { + let tempdir = tempfile::tempdir().unwrap(); + let root_dir = tempdir.path().join("artifact"); + fs::create_dir_all(&root_dir).unwrap(); + let object_store = Arc::new(ObjectStore::local()); + let root_path = Path::from_filesystem_path(&root_dir).unwrap(); + + let mut builder = PartitionArtifactBuilder::try_new_with_store( + object_store.clone(), + root_path.clone(), + 300, + 2, + ) + .unwrap(); + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new(ROW_ID, DataType::UInt64, false), + Field::new(PART_ID_COLUMN, DataType::UInt32, false), + Field::new( + PQ_CODE_COLUMN, + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::UInt8, true)), 2), + true, + ), + ])); + + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![10_u64, 11, 12, 13])), + Arc::new(UInt32Array::from(vec![0_u32, 256, 0, 256])), + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8]), + 2, + ) + .unwrap(), + ), + ], + ) + .unwrap(); + let batch2 = RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![14_u64, 15])), + Arc::new(UInt32Array::from(vec![1_u32, 256])), + Arc::new( + FixedSizeListArray::try_new_from_values( + 
UInt8Array::from(vec![9, 10, 11, 12]), + 2, + ) + .unwrap(), + ), + ], + ) + .unwrap(); + builder.append_batch(&batch1).await.unwrap(); + builder.append_batch(&batch2).await.unwrap(); + let artifact_files = builder.finish("metadata.lance", Some(2.5)).await.unwrap(); + assert_eq!(artifact_files[0], "manifest.json"); + assert!( + artifact_files + .iter() + .any(|path| path.ends_with("bucket-00000.lance")) + ); + + let manifest: PartitionArtifactManifest = + serde_json::from_slice(&fs::read(root_dir.join("manifest.json")).unwrap()).unwrap(); + assert_eq!(manifest.version, 1); + assert_eq!(manifest.metadata_file.as_deref(), Some("metadata.lance")); + assert_eq!(manifest.total_loss, Some(2.5)); + assert_eq!(manifest.partitions[0].num_rows, 2); + assert_eq!(manifest.partitions[0].ranges.len(), 1); + assert_eq!(manifest.partitions[1].num_rows, 1); + assert_eq!(manifest.partitions[1].ranges.len(), 1); + assert_eq!(manifest.partitions[256].num_rows, 3); + assert_eq!(manifest.partitions[256].ranges.len(), 1); + assert_eq!( + manifest.partitions[0].path, manifest.partitions[256].path, + "partitions sharing a bucket should share one final file" + ); + + let reader = PartitionArtifactShuffleReader::try_open_with_store(object_store, root_path) + .await + .unwrap(); + let partition_0 = reader + .read_partition(0) + .await + .unwrap() + .unwrap() + .try_collect::>() + .await + .unwrap(); + let partition_0_row_ids = partition_0 + .iter() + .flat_map(|batch| { + batch[ROW_ID] + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect::>(); + assert_eq!(partition_0_row_ids, vec![10, 12]); + + let partition_256 = reader + .read_partition(256) + .await + .unwrap() + .unwrap() + .try_collect::>() + .await + .unwrap(); + let partition_256_row_ids = partition_256 + .iter() + .flat_map(|batch| { + batch[ROW_ID] + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect::>(); + assert_eq!(partition_256_row_ids, vec![11, 13, 15]); + } + + #[tokio::test] + async fn 
partition_artifact_reader_reads_partition_ranges() { + let tempdir = tempfile::tempdir().unwrap(); + let root_dir = tempdir.path().join("artifact"); + fs::create_dir_all(root_dir.join("partitions")).unwrap(); + + let object_store = Arc::new(ObjectStore::local()); + let root_path = Path::from_filesystem_path(&root_dir).unwrap(); + let partition_path = root_path.child("partitions").child("bucket-00000.lance"); + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new(ROW_ID, arrow_schema::DataType::UInt64, false), + arrow_schema::Field::new( + lance_index::vector::PQ_CODE_COLUMN, + arrow_schema::DataType::FixedSizeList( + Arc::new(arrow_schema::Field::new( + "item", + arrow_schema::DataType::UInt8, + true, + )), + 2, + ), + true, + ), + ])); + let mut writer = FileWriter::try_new( + object_store.create(&partition_path).await.unwrap(), + Schema::try_from(schema.as_ref()).unwrap(), + FileWriterOptions::default(), + ) + .unwrap(); + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![10_u64, 11, 12])), + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(vec![1, 2, 3, 4, 5, 6]), + 2, + ) + .unwrap(), + ), + ], + ) + .unwrap(); + let batch2 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![13_u64, 14])), + Arc::new( + FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![7, 8, 9, 10]), 2) + .unwrap(), + ), + ], + ) + .unwrap(); + writer.write_batch(&batch1).await.unwrap(); + writer.write_batch(&batch2).await.unwrap(); + writer.finish().await.unwrap(); + + let manifest = serde_json::json!({ + "version": 1, + "num_partitions": 3, + "total_loss": 1.5, + "partitions": [ + { + "path": "partitions/bucket-00000.lance", + "num_rows": 2, + "ranges": [ + {"offset": 0, "num_rows": 1}, + {"offset": 3, "num_rows": 1}, + ], + }, + { + "path": "partitions/bucket-00000.lance", + "num_rows": 2, + "ranges": [ + {"offset": 1, "num_rows": 2}, + ], + }, + { + 
"num_rows": 0, + "ranges": [], + }, + ], + }); + fs::write( + root_dir.join("manifest.json"), + serde_json::to_vec(&manifest).unwrap(), + ) + .unwrap(); + + let reader = PartitionArtifactShuffleReader::try_open_with_store(object_store, root_path) + .await + .unwrap(); + assert_eq!(reader.partition_size(0).unwrap(), 2); + assert_eq!(reader.partition_size(1).unwrap(), 2); + assert_eq!(reader.partition_size(2).unwrap(), 0); + assert_eq!(reader.total_loss(), Some(1.5)); + + let stream = reader.read_partition(0).await.unwrap().unwrap(); + let batches = stream.try_collect::>().await.unwrap(); + let row_ids = batches + .iter() + .flat_map(|batch| { + batch[ROW_ID] + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect::>(); + assert_eq!(row_ids, vec![10, 13]); + assert!(reader.read_partition(2).await.unwrap().is_none()); + } + + #[tokio::test] + async fn partition_artifact_reader_rejects_missing_partition_entry() { + let tempdir = tempfile::tempdir().unwrap(); + let root_dir = tempdir.path().join("artifact"); + fs::create_dir_all(&root_dir).unwrap(); + let manifest = serde_json::json!({ + "version": 1, + "num_partitions": 2, + "partitions": [{"num_rows": 0, "ranges": []}], + }); + fs::write( + root_dir.join("manifest.json"), + serde_json::to_vec(&manifest).unwrap(), + ) + .unwrap(); + + let error = PartitionArtifactShuffleReader::try_open_with_store( + Arc::new(ObjectStore::local()), + Path::from_filesystem_path(&root_dir).unwrap(), + ) + .await + .unwrap_err(); + assert!(matches!(error, Error::InvalidInput { .. })); + } +} diff --git a/rust/lance/src/index/vector/utils.rs b/rust/lance/src/index/vector/utils.rs index 19156ac8eed..244a02c39bc 100644 --- a/rust/lance/src/index/vector/utils.rs +++ b/rust/lance/src/index/vector/utils.rs @@ -372,7 +372,7 @@ impl PartitionLoadLock { /// /// Handles both regular vector columns (FixedSizeList) and multivector columns /// (List\), flattening the latter. 
-fn vector_column_to_fsl(batch: &RecordBatch, column: &str) -> Result { +pub fn vector_column_to_fsl(batch: &RecordBatch, column: &str) -> Result { let array = get_column_from_batch(batch, column)?; match array.data_type() { arrow::datatypes::DataType::FixedSizeList(_, _) => Ok(array.as_fixed_size_list().clone()), From c0af4918b16cd5584e305cc0b90b74a723d7d9bf Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 7 Apr 2026 17:20:13 +0800 Subject: [PATCH 15/21] refactor: decouple cuvs backend from main tree --- Cargo.toml | 2 +- python/Cargo.lock | 112 +- python/Cargo.toml | 2 - python/python/lance/cuvs.py | 93 +- python/python/lance/dataset.py | 39 +- python/python/tests/test_vector_index.py | 165 +-- python/src/indices.rs | 91 -- rust/lance-cuvs/Cargo.toml | 28 - rust/lance-cuvs/src/lib.rs | 1237 ---------------------- 9 files changed, 67 insertions(+), 1702 deletions(-) delete mode 100644 rust/lance-cuvs/Cargo.toml delete mode 100644 rust/lance-cuvs/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index bddb49ed4a3..c922eff6b8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ "rust/compression/bitpacking", "rust/arrow-scalar", ] -exclude = ["python", "java/lance-jni", "rust/lance-cuvs"] +exclude = ["python", "java/lance-jni"] # Python package needs to be built by maturin. 
resolver = "3" diff --git a/python/Cargo.lock b/python/Cargo.lock index aa4cfb72154..4507a617872 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -14,7 +14,7 @@ dependencies = [ "core_extensions", "crossbeam-channel", "generational-arena", - "libloading 0.7.4", + "libloading", "lock_api", "parking_lot", "paste", @@ -1070,26 +1070,6 @@ dependencies = [ "virtue", ] -[[package]] -name = "bindgen" -version = "0.72.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" -dependencies = [ - "bitflags 2.11.0", - "cexpr", - "clang-sys", - "itertools 0.13.0", - "log", - "prettyplease", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 2.0.117", -] - [[package]] name = "bitflags" version = "1.3.2" @@ -1289,15 +1269,6 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" -[[package]] -name = "cexpr" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" -dependencies = [ - "nom 7.1.3", -] - [[package]] name = "cfg-if" version = "1.0.4" @@ -1344,17 +1315,6 @@ dependencies = [ "inout", ] -[[package]] -name = "clang-sys" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" -dependencies = [ - "glob", - "libc", - "libloading 0.8.9", -] - [[package]] name = "cmake" version = "0.1.58" @@ -1617,26 +1577,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "cuvs" -version = "26.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9778fa1e16f42539772496e9adba2a29c67dca84bcb0d247795f9cb3135ba87d" -dependencies = [ - "cuvs-sys", - "ndarray 0.15.6", -] - -[[package]] -name = "cuvs-sys" -version = "26.2.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4cad121da7a7ac908965352ffeac029a93fb0e3a1278a271f7204098b8724e9" -dependencies = [ - "bindgen", - "cmake", -] - [[package]] name = "darling" version = "0.20.11" @@ -4045,30 +3985,6 @@ dependencies = [ "url", ] -[[package]] -name = "lance-cuvs" -version = "5.0.0-beta.2" -dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", - "cuvs", - "cuvs-sys", - "futures", - "half", - "lance", - "lance-arrow", - "lance-core", - "lance-file", - "lance-index", - "lance-io", - "lance-linalg", - "log", - "ndarray 0.16.1", - "tokio", -] - [[package]] name = "lance-datafusion" version = "5.0.0-beta.2" @@ -4251,7 +4167,7 @@ dependencies = [ "lindera", "lindera-tantivy", "log", - "ndarray 0.16.1", + "ndarray", "num-traits", "object_store", "prost", @@ -4552,16 +4468,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "libloading" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" -dependencies = [ - "cfg-if", - "windows-link", -] - [[package]] name = "liblzma" version = "0.4.6" @@ -4988,19 +4894,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" -[[package]] -name = "ndarray" -version = "0.15.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" -dependencies = [ - "matrixmultiply", - "num-complex", - "num-integer", - "num-traits", - "rawpointer", -] - [[package]] name = "ndarray" version = "0.16.1" @@ -5796,7 +5689,6 @@ dependencies = [ "lance", "lance-arrow", "lance-core", - "lance-cuvs", "lance-datafusion", "lance-datagen", "lance-encoding", diff --git a/python/Cargo.toml b/python/Cargo.toml index d63a8e113d4..a3542f7360f 100644 --- a/python/Cargo.toml +++ 
b/python/Cargo.toml @@ -35,7 +35,6 @@ lance = { path = "../rust/lance", features = [ ] } lance-arrow = { path = "../rust/lance-arrow" } lance-core = { path = "../rust/lance-core" } -lance-cuvs = { path = "../rust/lance-cuvs", optional = true } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } @@ -76,7 +75,6 @@ bytes = "1.4" [features] default = [] -cuvs = ["dep:lance-cuvs"] datagen = ["lance-datagen"] fp16kernels = ["lance/fp16kernels"] diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index b36fd18c564..c32dc12b55c 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -38,28 +38,49 @@ DEFAULT_PARTITION_ARTIFACT_BUCKETS = 256 PARTITION_ARTIFACT_ROW_ID_COLUMN = "_rowid" -try: - from . import lance as _lance_ext +def build_vector_index_on_cuvs( + dataset, + column: str, + metric_type: str, + accelerator: str, + num_partitions: int, + num_sub_vectors: int, + dst_dataset_uri: str | Path | None = None, + *, + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + batch_size: int = 1024 * 128, + filter_nan: bool = True, +): + if dst_dataset_uri is None: + dst_dataset_uri = tempfile.mkdtemp() - _assign_ivf_pq_on_cuvs_rust_impl = getattr( - _lance_ext.indices, "_assign_ivf_pq_on_cuvs_rust" - ) - _train_ivf_pq_on_cuvs_rust_impl = getattr( - _lance_ext.indices, "_train_ivf_pq_on_cuvs_rust" + trained_index, ivf_centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( + dataset, + column, + num_partitions, + metric_type, + accelerator, + num_sub_vectors=num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, ) -except (ImportError, AttributeError): - _assign_ivf_pq_on_cuvs_rust_impl = None - _train_ivf_pq_on_cuvs_rust_impl = None - - -def _has_rust_cuvs_backend() -> bool: - return ( - _train_ivf_pq_on_cuvs_rust_impl is not None - and 
_assign_ivf_pq_on_cuvs_rust_impl is not None + artifact_root, artifact_files = one_pass_assign_ivf_pq_on_cuvs( + dataset, + column, + metric_type, + accelerator, + ivf_centroids, + pq_codebook, + trained_index=trained_index, + dst_dataset_uri=dst_dataset_uri, + batch_size=batch_size, + filter_nan=filter_nan, ) - -def _unwrap_dataset(dataset): - return getattr(dataset, "_ds", dataset) + return artifact_root, artifact_files, ivf_centroids, pq_codebook def is_cuvs_accelerator(accelerator: object) -> bool: @@ -374,19 +395,6 @@ def _train_ivf_pq_index_on_cuvs( num_bits: int = 8, filter_nan: bool = True, ): - if _has_rust_cuvs_backend(): - return _train_ivf_pq_on_cuvs_rust_impl( - _unwrap_dataset(dataset), - column, - num_partitions, - metric_type, - num_sub_vectors, - sample_rate=sample_rate, - max_iters=max_iters, - num_bits=num_bits, - filter_nan=filter_nan, - ) - if accelerator != "cuvs": raise ValueError("cuVS acceleration only supports accelerator='cuvs'") if num_bits != 8: @@ -446,27 +454,6 @@ def one_pass_assign_ivf_pq_on_cuvs( *, filter_nan: bool = True, ): - if _has_rust_cuvs_backend(): - if accelerator != "cuvs": - raise ValueError("cuVS acceleration only supports accelerator='cuvs'") - if trained_index is None: - raise ValueError( - "one_pass_assign_ivf_pq_on_cuvs requires a trained cuVS index for " - "single-node transform" - ) - if dst_dataset_uri is None: - dst_dataset_uri = tempfile.mkdtemp() - artifact_files = _assign_ivf_pq_on_cuvs_rust_impl( - _unwrap_dataset(dataset), - column, - trained_index, - str(dst_dataset_uri), - batch_size=batch_size, - filter_nan=filter_nan, - ) - LOGGER.info("Saved precomputed partition artifact to %s", dst_dataset_uri) - return str(dst_dataset_uri), artifact_files - if accelerator != "cuvs": raise ValueError("cuVS acceleration only supports accelerator='cuvs'") diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 415ffdb8865..ee023048a1e 100644 --- a/python/python/lance/dataset.py +++ 
b/python/python/lance/dataset.py @@ -2960,48 +2960,29 @@ def _create_index_impl( ) if use_cuvs: - from .cuvs import ( - _train_ivf_pq_index_on_cuvs, - one_pass_assign_ivf_pq_on_cuvs, - ) + from .cuvs import build_vector_index_on_cuvs - LOGGER.info("Doing one-pass ivfpq cuVS training") - timers["ivf+pq_train:start"] = time.time() - trained_index, ivf_centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( + LOGGER.info("Doing cuVS vector backend build") + timers["ivf+pq_build:start"] = time.time() + artifact_root, _, ivf_centroids, pq_codebook = build_vector_index_on_cuvs( self, column[0], - num_partitions, metric, accelerator, - num_sub_vectors=num_sub_vectors, + num_partitions, + num_sub_vectors, sample_rate=kwargs.get("sample_rate", 256), max_iters=kwargs.get("max_iters", 50), num_bits=kwargs.get("num_bits", 8), - filter_nan=filter_nan, - ) - timers["ivf+pq_train:end"] = time.time() - ivfpq_train_time = ( - timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] - ) - LOGGER.info("cuVS ivf+pq training time: %ss", ivfpq_train_time) - timers["ivf+pq_assign:start"] = time.time() - artifact_root, _ = one_pass_assign_ivf_pq_on_cuvs( - self, - column[0], - metric, - accelerator, - ivf_centroids, - pq_codebook, - trained_index=trained_index, batch_size=1024 * 128, filter_nan=filter_nan, ) kwargs["precomputed_partition_artifact_uri"] = artifact_root - timers["ivf+pq_assign:end"] = time.time() - ivfpq_assign_time = ( - timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] + timers["ivf+pq_build:end"] = time.time() + ivfpq_build_time = ( + timers["ivf+pq_build:end"] - timers["ivf+pq_build:start"] ) - LOGGER.info("cuVS ivf+pq transform time: %ss", ivfpq_assign_time) + LOGGER.info("cuVS ivf+pq build time: %ss", ivfpq_build_time) else: from .vector import ( one_pass_assign_ivf_pq_on_accelerator, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index c2b42de2ac1..9aba519fa9c 100644 --- 
a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -30,8 +30,7 @@ def _disable_rust_cuvs_backend(monkeypatch): - monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", None) - monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", None) + del monkeypatch def create_table(nvec=1000, ndim=128, nans=0, nullify=False, dtype=np.float32): @@ -549,22 +548,14 @@ def test_create_index_cuvs_dispatch(tmp_path, monkeypatch): dataset = lance.write_dataset(tbl, tmp_path) calls = {} - class FakeIndex: - pq_dim = 16 - pq_bits = 8 - - def fake_train( + def fake_build( dataset_arg, column, - num_partitions, metric_type, accelerator, + num_partitions, num_sub_vectors, - *, - sample_rate, - max_iters, - num_bits, - filter_nan, + **kwargs, ): calls["dataset"] = dataset_arg calls["column"] = column @@ -572,52 +563,16 @@ def fake_train( calls["metric_type"] = metric_type calls["accelerator"] = accelerator calls["num_sub_vectors"] = num_sub_vectors - calls["sample_rate"] = sample_rate - calls["max_iters"] = max_iters - calls["num_bits"] = num_bits - calls["filter_nan"] = filter_nan - return ( - FakeIndex(), - np.random.randn(num_partitions, 128).astype(np.float32), - np.random.randn(num_sub_vectors, 256, 128 // num_sub_vectors).astype( - np.float32 - ), - ) - - def fake_assign( - dataset_arg, - column, - metric_type, - accelerator, - ivf_centroids, - pq_codebook, - trained_index, - dst_path=None, - batch_size=20480, - *, - filter_nan, - ): - calls["assign_dataset"] = dataset_arg - calls["assign_column"] = column - calls["assign_metric_type"] = metric_type - calls["assign_accelerator"] = accelerator - calls["assign_ivf_centroids"] = ivf_centroids - calls["assign_pq_codebook"] = pq_codebook - calls["assign_trained_index"] = trained_index - calls["assign_batch_size"] = batch_size - calls["assign_filter_nan"] = filter_nan + calls["kwargs"] = kwargs return str(tmp_path / "cuvs_artifact"), [ "manifest.json", "metadata.lance", 
"partitions/bucket-00000.lance", - ] + ], np.random.randn(num_partitions, 128).astype(np.float32), np.random.randn( + num_sub_vectors, 256, 128 // num_sub_vectors + ).astype(np.float32) - monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_index_on_cuvs", fake_train) - monkeypatch.setattr( - lance_cuvs, - "one_pass_assign_ivf_pq_on_cuvs", - fake_assign, - ) + monkeypatch.setattr(lance_cuvs, "build_vector_index_on_cuvs", fake_build) dataset = dataset.create_index( "vector", @@ -632,10 +587,11 @@ def fake_assign( assert calls["metric_type"] == "L2" assert calls["accelerator"] == "cuvs" assert calls["num_sub_vectors"] == 16 - assert calls["assign_column"] == "vector" - assert calls["assign_metric_type"] == "L2" - assert calls["assign_accelerator"] == "cuvs" - assert isinstance(calls["assign_trained_index"], FakeIndex) + assert calls["kwargs"]["sample_rate"] == 256 + assert calls["kwargs"]["max_iters"] == 50 + assert calls["kwargs"]["num_bits"] == 8 + assert calls["kwargs"]["batch_size"] == 1024 * 128 + assert calls["kwargs"]["filter_nan"] is True assert dataset.stats.index_stats("vector_idx")["index_type"] == "IVF_PQ" @@ -741,55 +697,6 @@ def build(build_params, matrix): assert pq_codebook.shape == (4, 256, 4) -def test_train_ivf_pq_on_cuvs_prefers_rust_backend(tmp_path, monkeypatch): - dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) - calls = {} - - class FakeRustIndex: - pass - - def fake_train(*args, **kwargs): - calls["args"] = args - calls["kwargs"] = kwargs - return ( - FakeRustIndex(), - pa.FixedSizeListArray.from_arrays( - pa.array(np.arange(64, dtype=np.float32)), 16 - ), - pa.FixedSizeListArray.from_arrays( - pa.array(np.arange(4 * 256 * 4, dtype=np.float32)), 4 - ), - ) - - monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", fake_train) - monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", object()) - monkeypatch.setattr( - lance_cuvs, - "_require_cuvs", - lambda: (_ for _ in ()).throw(AssertionError("python 
cuVS backend should not run")), - ) - - trained_index, centroids, pq_codebook = lance_cuvs._train_ivf_pq_index_on_cuvs( - dataset, - "vector", - 4, - "l2", - "cuvs", - 4, - sample_rate=8, - max_iters=30, - num_bits=8, - filter_nan=True, - ) - - assert isinstance(trained_index, FakeRustIndex) - assert calls["args"][:5] == (dataset, "vector", 4, "l2", 4) - assert calls["kwargs"]["sample_rate"] == 8 - assert calls["kwargs"]["max_iters"] == 30 - assert isinstance(centroids, pa.FixedSizeListArray) - assert isinstance(pq_codebook, pa.FixedSizeListArray) - - def test_train_ivf_pq_on_cuvs_uses_num_sub_vectors_for_pq_dim( tmp_path, monkeypatch ): @@ -958,50 +865,6 @@ def transform(index, vectors): assert bucket_table.column("__pq_code").type == pa.list_(pa.uint8(), 4) -def test_one_pass_assign_ivf_pq_on_cuvs_prefers_rust_backend(tmp_path, monkeypatch): - dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path / "cuvs_assign_rust") - calls = {} - - class FakeRustIndex: - pass - - def fake_assign(*args, **kwargs): - calls["args"] = args - calls["kwargs"] = kwargs - return ["manifest.json", "metadata.lance", "partitions/bucket-00000.lance"] - - monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", object()) - monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", fake_assign) - monkeypatch.setattr( - lance_cuvs, - "_require_cuvs", - lambda: (_ for _ in ()).throw(AssertionError("python cuVS backend should not run")), - ) - - artifact_root, artifact_files = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( - dataset, - "vector", - "l2", - "cuvs", - np.random.randn(4, 16).astype(np.float32), - np.random.randn(4, 256, 4).astype(np.float32), - trained_index=FakeRustIndex(), - dst_dataset_uri=tmp_path / "artifact", - batch_size=4096, - ) - - assert artifact_root == str(tmp_path / "artifact") - assert artifact_files[0] == "manifest.json" - assert calls["args"][:4] == ( - dataset, - "vector", - calls["args"][2], - str(tmp_path / "artifact"), - ) - 
assert isinstance(calls["args"][2], FakeRustIndex) - assert calls["kwargs"]["batch_size"] == 4096 - - def test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( tmp_path, monkeypatch, diff --git a/python/src/indices.rs b/python/src/indices.rs index cb8288b51a5..cea7f2a968a 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -32,8 +32,6 @@ use pyo3::{ }; use lance::index::DatasetIndexInternalExt; -#[cfg(feature = "cuvs")] -use lance_cuvs::TrainedIvfPqIndex; use crate::fragment::FileFragment; use crate::utils::{PyJson, PyLance}; @@ -157,89 +155,6 @@ impl PyIvfModel { } } -#[cfg(feature = "cuvs")] -#[pyclass(name = "_CuvsIvfPqIndex", module = "lance.indices", unsendable)] -pub struct PyCuvsIvfPqIndex { - inner: TrainedIvfPqIndex, -} - -#[cfg(feature = "cuvs")] -#[pyfunction] -#[allow(clippy::too_many_arguments)] -#[pyo3( - signature=( - dataset, - column, - num_partitions, - distance_type, - num_sub_vectors, - sample_rate=256, - max_iters=50, - num_bits=8, - filter_nan=true - ) -)] -fn _train_ivf_pq_on_cuvs_rust<'py>( - py: Python<'py>, - dataset: &Dataset, - column: &str, - num_partitions: u32, - distance_type: &str, - num_sub_vectors: u32, - sample_rate: u32, - max_iters: u32, - num_bits: u8, - filter_nan: bool, -) -> PyResult<(Py, Bound<'py, PyAny>, Bound<'py, PyAny>)> { - let distance_type = DistanceType::try_from(distance_type).unwrap(); - let trained = rt() - .runtime - .block_on(lance_cuvs::train_ivf_pq( - dataset.ds.as_ref(), - column, - num_partitions as usize, - distance_type, - num_sub_vectors as usize, - sample_rate as usize, - max_iters as usize, - num_bits as usize, - filter_nan, - )) - .infer_error()?; - let ivf_centroids = trained.ivf_centroids().clone().into_data().to_pyarrow(py)?; - let pq_codebook = trained.pq_codebook().clone().into_data().to_pyarrow(py)?; - Ok(( - Py::new(py, PyCuvsIvfPqIndex { inner: trained })?, - ivf_centroids, - pq_codebook, - )) -} - -#[cfg(feature = "cuvs")] -#[pyfunction] 
-#[pyo3(signature=(dataset, column, trained_index, artifact_root, batch_size=1024 * 128, filter_nan=true))] -fn _assign_ivf_pq_on_cuvs_rust( - py: Python<'_>, - dataset: &Dataset, - column: &str, - trained_index: &PyCuvsIvfPqIndex, - artifact_root: &str, - batch_size: usize, - filter_nan: bool, -) -> PyResult> { - let _ = py; - rt().runtime - .block_on(lance_cuvs::assign_ivf_pq_to_artifact( - dataset.ds.as_ref(), - column, - &trained_index.inner, - artifact_root, - batch_size, - filter_nan, - )) - .infer_error() -} - /// Internal helper to fetch an IVF model for the given index name. async fn do_get_ivf_model(dataset: &Dataset, index_name: &str) -> PyResult { use lance_index::metrics::NoOpMetricsCollector; @@ -801,12 +716,6 @@ pub fn register_indices(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { indices.add_class::()?; indices.add_class::()?; indices.add_wrapped(wrap_pyfunction!(get_ivf_model))?; - #[cfg(feature = "cuvs")] - { - indices.add_class::()?; - indices.add_wrapped(wrap_pyfunction!(_train_ivf_pq_on_cuvs_rust))?; - indices.add_wrapped(wrap_pyfunction!(_assign_ivf_pq_on_cuvs_rust))?; - } m.add_submodule(&indices)?; Ok(()) } diff --git a/rust/lance-cuvs/Cargo.toml b/rust/lance-cuvs/Cargo.toml deleted file mode 100644 index a001f82c16f..00000000000 --- a/rust/lance-cuvs/Cargo.toml +++ /dev/null @@ -1,28 +0,0 @@ -[package] -name = "lance-cuvs" -version = "5.0.0-beta.2" -edition = "2024" -authors = ["Lance Devs "] -license = "Apache-2.0" -rust-version = "1.91" -publish = false - -[dependencies] -arrow = "57.0.0" -arrow-array = "57.0.0" -arrow-buffer = "57.0.0" -arrow-schema = "57.0.0" -cuvs = "26.2.0" -cuvs-sys = "26.2.0" -futures = "0.3" -half = { version = "2.5", default-features = false, features = ["num-traits", "std"] } -lance = { path = "../lance" } -lance-arrow = { path = "../lance-arrow" } -lance-core = { path = "../lance-core" } -lance-file = { path = "../lance-file" } -lance-index = { path = "../lance-index" } -lance-io = { path = 
"../lance-io" } -lance-linalg = { path = "../lance-linalg" } -log = "0.4" -ndarray = { version = "0.16.1", features = ["matrixmultiply-threading"] } -tokio = { version = "1.48", features = ["rt-multi-thread"] } diff --git a/rust/lance-cuvs/src/lib.rs b/rust/lance-cuvs/src/lib.rs deleted file mode 100644 index db54ce47f22..00000000000 --- a/rust/lance-cuvs/src/lib.rs +++ /dev/null @@ -1,1237 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -use std::ffi::{CStr, c_void}; -use std::marker::PhantomData; -use std::ptr; -use std::sync::Arc; -use arrow::compute::filter; -use arrow_array::cast::AsArray; -use arrow_array::types::{Float16Type, Float32Type, Float64Type, UInt8Type}; -use arrow_array::{ - Array, FixedSizeListArray, Float32Array, ListArray, RecordBatch, UInt8Array, UInt32Array, - UInt64Array, -}; -use arrow_buffer::{OffsetBuffer, ScalarBuffer}; -use arrow_schema::{DataType, Field, Schema as ArrowSchema}; -use cuvs::Resources; -use futures::TryStreamExt; -use lance::dataset::Dataset; -use lance::index::vector::PartitionArtifactBuilder; -use lance::index::vector::utils::{infer_vector_dim, vector_column_to_fsl}; -use lance_arrow::{FixedSizeListArrayExt, RecordBatchExt}; -use lance_core::{Error, ROW_ID, Result}; -use lance_file::version::LanceFileVersion; -use lance_file::writer::{FileWriter, FileWriterOptions}; -use lance_index::vector::utils::is_finite; -use lance_index::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN, PQ_CODE_COLUMN}; -use lance_linalg::distance::DistanceType; -use log::warn; -use ndarray::{Array2, ArrayView2}; - -const PARTITION_ARTIFACT_METADATA_FILE_NAME: &str = "metadata.lance"; -const PARTITION_ARTIFACT_FILE_VERSION: &str = "2.2"; -const PIPELINE_SLOTS: usize = 2; - -type CudaEventHandle = *mut c_void; - -#[link(name = "cudart")] -unsafe extern "C" { - fn cudaMallocHost(ptr: *mut *mut c_void, size: usize) -> cuvs_sys::cudaError_t; - fn cudaFreeHost(ptr: *mut c_void) -> 
cuvs_sys::cudaError_t; - fn cudaEventCreate(event: *mut CudaEventHandle) -> cuvs_sys::cudaError_t; - fn cudaEventDestroy(event: CudaEventHandle) -> cuvs_sys::cudaError_t; - fn cudaEventRecord( - event: CudaEventHandle, - stream: cuvs_sys::cudaStream_t, - ) -> cuvs_sys::cudaError_t; - fn cudaEventSynchronize(event: CudaEventHandle) -> cuvs_sys::cudaError_t; -} - -pub struct TrainedIvfPqIndex { - resources: Resources, - index: CuvsIvfPqIndex, - num_partitions: usize, - dimension: usize, - num_sub_vectors: usize, - num_bits: usize, - metric_type: DistanceType, - ivf_centroids: FixedSizeListArray, - pq_codebook: FixedSizeListArray, -} - -impl TrainedIvfPqIndex { - pub fn ivf_centroids(&self) -> &FixedSizeListArray { - &self.ivf_centroids - } - - pub fn pq_codebook(&self) -> &FixedSizeListArray { - &self.pq_codebook - } - - pub fn num_partitions(&self) -> usize { - self.num_partitions - } - - pub fn pq_code_width(&self) -> usize { - self.num_sub_vectors - } - - pub fn metric_type(&self) -> DistanceType { - self.metric_type - } - - pub fn num_bits(&self) -> usize { - self.num_bits - } -} - -struct CuvsIvfPqIndex { - raw: cuvs_sys::cuvsIvfPqIndex_t, -} - -impl CuvsIvfPqIndex { - fn try_new() -> Result { - let mut raw = ptr::null_mut(); - check_cuvs( - unsafe { cuvs_sys::cuvsIvfPqIndexCreate(&mut raw) }, - "create IVF_PQ index", - )?; - Ok(Self { raw }) - } -} - -impl Drop for CuvsIvfPqIndex { - fn drop(&mut self) { - if !self.raw.is_null() { - let _ = unsafe { cuvs_sys::cuvsIvfPqIndexDestroy(self.raw) }; - } - } -} - -enum MatrixBuffer<'a> { - Borrowed { - values: &'a [f32], - rows: usize, - cols: usize, - }, - Owned(Array2), -} - -impl MatrixBuffer<'_> { - fn view(&self) -> Result> { - match self { - Self::Borrowed { values, rows, cols } => ArrayView2::from_shape((*rows, *cols), values) - .map_err(|error| { - Error::io(format!("failed to create borrowed matrix view: {error}")) - }), - Self::Owned(array) => Ok(array.view()), - } - } - - fn rows(&self) -> usize { - match 
self { - Self::Borrowed { rows, .. } => *rows, - Self::Owned(array) => array.nrows(), - } - } -} - -struct HostTensorView { - shape: Vec, - tensor: cuvs_sys::DLManagedTensor, -} - -impl HostTensorView { - fn try_new(shape: &[usize], data: *mut std::ffi::c_void) -> Self { - let shape = shape.iter().map(|dim| *dim as i64).collect::>(); - let tensor = cuvs_sys::DLManagedTensor { - dl_tensor: cuvs_sys::DLTensor { - data, - device: cuvs_sys::DLDevice { - device_type: cuvs_sys::DLDeviceType::kDLCPU, - device_id: 0, - }, - ndim: shape.len() as i32, - dtype: T::dl_dtype(), - shape: shape.as_ptr() as *mut i64, - strides: ptr::null_mut(), - byte_offset: 0, - }, - manager_ctx: ptr::null_mut(), - deleter: None, - }; - Self { shape, tensor } - } - - fn as_mut_ptr(&mut self) -> *mut cuvs_sys::DLManagedTensor { - debug_assert_eq!(self.shape.len(), self.tensor.dl_tensor.ndim as usize); - &mut self.tensor - } -} - -trait DlElement: Copy + Default { - fn dl_dtype() -> cuvs_sys::DLDataType; -} - -impl DlElement for f32 { - fn dl_dtype() -> cuvs_sys::DLDataType { - cuvs_sys::DLDataType { - code: cuvs_sys::DLDataTypeCode::kDLFloat as u8, - bits: 32, - lanes: 1, - } - } -} - -impl DlElement for u8 { - fn dl_dtype() -> cuvs_sys::DLDataType { - cuvs_sys::DLDataType { - code: cuvs_sys::DLDataTypeCode::kDLUInt as u8, - bits: 8, - lanes: 1, - } - } -} - -impl DlElement for u32 { - fn dl_dtype() -> cuvs_sys::DLDataType { - cuvs_sys::DLDataType { - code: cuvs_sys::DLDataTypeCode::kDLUInt as u8, - bits: 32, - lanes: 1, - } - } -} - -struct DeviceTensor { - shape: Vec, - tensor: cuvs_sys::DLManagedTensor, - capacity_bytes: usize, - resources: cuvs_sys::cuvsResources_t, - _marker: PhantomData, -} - -impl DeviceTensor { - fn try_new(resources: &Resources, shape: &[usize]) -> Result { - let capacity_bytes = shape.iter().product::() * std::mem::size_of::(); - let mut data = ptr::null_mut(); - check_cuvs( - unsafe { cuvs_sys::cuvsRMMAlloc(resources.0, &mut data, capacity_bytes) }, - "allocate device 
tensor", - )?; - let shape = shape.iter().map(|dim| *dim as i64).collect::>(); - let tensor = cuvs_sys::DLManagedTensor { - dl_tensor: cuvs_sys::DLTensor { - data, - device: cuvs_sys::DLDevice { - device_type: cuvs_sys::DLDeviceType::kDLCUDA, - device_id: 0, - }, - ndim: shape.len() as i32, - dtype: T::dl_dtype(), - shape: shape.as_ptr() as *mut i64, - strides: ptr::null_mut(), - byte_offset: 0, - }, - manager_ctx: ptr::null_mut(), - deleter: None, - }; - Ok(Self { - shape, - tensor, - capacity_bytes, - resources: resources.0, - _marker: PhantomData, - }) - } - - fn as_mut_ptr(&mut self) -> *mut cuvs_sys::DLManagedTensor { - debug_assert_eq!(self.shape.len(), self.tensor.dl_tensor.ndim as usize); - &mut self.tensor - } - - fn set_shape(&mut self, shape: &[usize]) -> Result<()> { - if shape.len() != self.shape.len() { - return Err(Error::io(format!( - "device tensor rank mismatch: expected {}, got {}", - self.shape.len(), - shape.len() - ))); - } - let required_bytes = shape.iter().product::() * std::mem::size_of::(); - if required_bytes > self.capacity_bytes { - return Err(Error::io(format!( - "device tensor capacity {} bytes is smaller than requested shape {:?} ({} bytes)", - self.capacity_bytes, shape, required_bytes - ))); - } - for (dst, src) in self.shape.iter_mut().zip(shape) { - *dst = *src as i64; - } - Ok(()) - } - - fn current_len(&self) -> usize { - self.shape.iter().map(|dim| *dim as usize).product() - } - - fn current_bytes(&self) -> usize { - self.current_len() * std::mem::size_of::() - } - - fn copy_from_host_async(&mut self, resources: &Resources, src: &[T]) -> Result<()> { - let expected_len = self.current_len(); - if src.len() != expected_len { - return Err(Error::io(format!( - "device tensor copy expects {expected_len} elements, got {}", - src.len() - ))); - } - check_cuda( - unsafe { - cuvs_sys::cudaMemcpyAsync( - self.tensor.dl_tensor.data, - src.as_ptr() as *const _, - self.current_bytes(), - cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, - 
resources - .get_cuda_stream() - .map_err(|e| Error::io(e.to_string()))?, - ) - }, - "copy host tensor to device", - ) - } - - fn copy_to_host_async(&self, resources: &Resources, dst: &mut [T]) -> Result<()> { - let expected_len = self.current_len(); - if dst.len() != expected_len { - return Err(Error::io(format!( - "device tensor copy expects destination length {expected_len}, got {}", - dst.len() - ))); - } - check_cuda( - unsafe { - cuvs_sys::cudaMemcpyAsync( - dst.as_mut_ptr() as *mut _, - self.tensor.dl_tensor.data, - self.current_bytes(), - cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, - resources - .get_cuda_stream() - .map_err(|e| Error::io(e.to_string()))?, - ) - }, - "copy device tensor to host", - ) - } -} - -impl Drop for DeviceTensor { - fn drop(&mut self) { - if !self.tensor.dl_tensor.data.is_null() { - let _ = unsafe { - cuvs_sys::cuvsRMMFree( - self.resources, - self.tensor.dl_tensor.data, - self.capacity_bytes, - ) - }; - } - } -} - -struct PinnedHostBuffer { - ptr: *mut T, - len: usize, - _marker: PhantomData, -} - -impl PinnedHostBuffer { - fn try_new(len: usize) -> Result { - let bytes = len - .checked_mul(std::mem::size_of::()) - .ok_or_else(|| Error::io("pinned host allocation size overflow"))?; - let mut raw = ptr::null_mut(); - check_cuda( - unsafe { cudaMallocHost(&mut raw, bytes) }, - "allocate pinned host buffer", - )?; - Ok(Self { - ptr: raw.cast::(), - len, - _marker: PhantomData, - }) - } - - fn as_slice(&self) -> &[T] { - unsafe { std::slice::from_raw_parts(self.ptr, self.len) } - } - - fn as_mut_slice(&mut self) -> &mut [T] { - unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) } - } - - fn prefix(&self, len: usize) -> Result<&[T]> { - if len > self.len { - return Err(Error::io(format!( - "pinned host buffer length {} is smaller than requested prefix {}", - self.len, len - ))); - } - Ok(&self.as_slice()[..len]) - } - - fn prefix_mut(&mut self, len: usize) -> Result<&mut [T]> { - if len > self.len { - return 
Err(Error::io(format!( - "pinned host buffer length {} is smaller than requested prefix {}", - self.len, len - ))); - } - Ok(&mut self.as_mut_slice()[..len]) - } - - fn copy_from_slice(&mut self, src: &[T]) -> Result<()> { - if src.len() > self.len { - return Err(Error::io(format!( - "pinned host buffer length {} is smaller than source length {}", - self.len, - src.len() - ))); - } - self.prefix_mut(src.len())?.copy_from_slice(src); - Ok(()) - } -} - -impl Drop for PinnedHostBuffer { - fn drop(&mut self) { - if !self.ptr.is_null() { - let _ = unsafe { cudaFreeHost(self.ptr.cast::()) }; - } - } -} - -struct CudaEvent { - raw: CudaEventHandle, -} - -impl CudaEvent { - fn try_new() -> Result { - let mut raw = ptr::null_mut(); - check_cuda(unsafe { cudaEventCreate(&mut raw) }, "create CUDA event")?; - Ok(Self { raw }) - } - - fn record(&self, stream: cuvs_sys::cudaStream_t) -> Result<()> { - check_cuda( - unsafe { cudaEventRecord(self.raw, stream) }, - "record CUDA event", - ) - } - - fn synchronize(&self) -> Result<()> { - check_cuda( - unsafe { cudaEventSynchronize(self.raw) }, - "synchronize CUDA event", - ) - } - -} - -impl Drop for CudaEvent { - fn drop(&mut self) { - if !self.raw.is_null() { - let _ = unsafe { cudaEventDestroy(self.raw) }; - } - } -} - -fn check_cuvs(status: cuvs_sys::cuvsError_t, context: &str) -> Result<()> { - if status == cuvs_sys::cuvsError_t::CUVS_SUCCESS { - return Ok(()); - } - - let message = unsafe { - let text = cuvs_sys::cuvsGetLastErrorText(); - if text.is_null() { - format!("{status:?}") - } else { - format!( - "{status:?}: {}", - CStr::from_ptr(text).to_string_lossy().into_owned() - ) - } - }; - Err(Error::io(format!("cuVS failed to {context}: {message}"))) -} - -fn check_cuda(status: cuvs_sys::cudaError_t, context: &str) -> Result<()> { - if status == cuvs_sys::cudaError::cudaSuccess { - Ok(()) - } else { - Err(Error::io(format!("CUDA failed to {context}: {status:?}"))) - } -} - -fn cuvs_distance_type(metric_type: DistanceType) -> 
Result { - match metric_type { - DistanceType::L2 => Ok(cuvs_sys::cuvsDistanceType::L2Expanded), - DistanceType::Cosine => Ok(cuvs_sys::cuvsDistanceType::CosineExpanded), - DistanceType::Dot => Ok(cuvs_sys::cuvsDistanceType::InnerProduct), - other => Err(Error::not_supported(format!( - "cuVS IVF_PQ does not support metric {other:?}" - ))), - } -} - -fn create_index_params( - metric_type: DistanceType, - num_partitions: usize, - num_sub_vectors: usize, - sample_rate: usize, - max_iters: usize, - num_bits: usize, -) -> Result { - let mut params = ptr::null_mut(); - check_cuvs( - unsafe { cuvs_sys::cuvsIvfPqIndexParamsCreate(&mut params) }, - "allocate IVF_PQ index params", - )?; - let metric = cuvs_distance_type(metric_type)?; - unsafe { - (*params).metric = metric; - (*params).metric_arg = 0.0; - (*params).add_data_on_build = false; - (*params).n_lists = num_partitions as u32; - (*params).kmeans_n_iters = max_iters as u32; - (*params).kmeans_trainset_fraction = 1.0; - (*params).pq_bits = num_bits as u32; - (*params).pq_dim = num_sub_vectors as u32; - (*params).codebook_kind = - cuvs_sys::cuvsIvfPqCodebookGen::CUVS_IVF_PQ_CODEBOOK_GEN_PER_SUBSPACE; - (*params).force_random_rotation = false; - (*params).conservative_memory_allocation = false; - (*params).max_train_points_per_pq_code = sample_rate as u32; - (*params).codes_layout = cuvs_sys::cuvsIvfPqListLayout::CUVS_IVF_PQ_LIST_LAYOUT_FLAT; - } - Ok(params) -} - -fn destroy_index_params(params: cuvs_sys::cuvsIvfPqIndexParams_t) { - if !params.is_null() { - let _ = unsafe { cuvs_sys::cuvsIvfPqIndexParamsDestroy(params) }; - } -} - -fn make_tensor_view() -> HostTensorView { - let shape = Vec::new(); - let tensor = cuvs_sys::DLManagedTensor { - dl_tensor: cuvs_sys::DLTensor { - data: ptr::null_mut(), - device: cuvs_sys::DLDevice { - device_type: cuvs_sys::DLDeviceType::kDLCPU, - device_id: 0, - }, - ndim: 0, - dtype: ::dl_dtype(), - shape: shape.as_ptr() as *mut i64, - strides: ptr::null_mut(), - byte_offset: 0, - }, - 
manager_ctx: ptr::null_mut(), - deleter: None, - }; - HostTensorView { shape, tensor } -} - -fn tensor_shape(tensor: &cuvs_sys::DLManagedTensor) -> Vec { - let dl_tensor = &tensor.dl_tensor; - (0..dl_tensor.ndim) - .map(|idx| unsafe { *dl_tensor.shape.add(idx as usize) as usize }) - .collect() -} - -fn tensor_num_bytes(tensor: &cuvs_sys::DLManagedTensor) -> usize { - let shape = tensor_shape(tensor); - let numel = shape.into_iter().product::(); - numel * ((tensor.dl_tensor.dtype.bits as usize) / 8) -} - -fn copy_tensor_to_host_f32_2d( - resources: &Resources, - tensor: &cuvs_sys::DLManagedTensor, -) -> Result> { - let shape = tensor_shape(tensor); - if shape.len() != 2 { - return Err(Error::io(format!( - "expected 2D tensor, got shape {shape:?}" - ))); - } - let mut array = Array2::::zeros((shape[0], shape[1])); - check_cuda( - unsafe { - cuvs_sys::cudaMemcpyAsync( - array.as_mut_ptr() as *mut _, - tensor.dl_tensor.data, - tensor_num_bytes(tensor), - cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, - resources - .get_cuda_stream() - .map_err(|e| Error::io(e.to_string()))?, - ) - }, - "copy tensor to host", - )?; - resources - .sync_stream() - .map_err(|e| Error::io(e.to_string()))?; - Ok(array) -} - -fn copy_tensor_to_host_f32_3d( - resources: &Resources, - tensor: &cuvs_sys::DLManagedTensor, -) -> Result<(Vec, [usize; 3])> { - let shape = tensor_shape(tensor); - if shape.len() != 3 { - return Err(Error::io(format!( - "expected 3D tensor, got shape {shape:?}" - ))); - } - let mut values = vec![0.0f32; shape.iter().product()]; - check_cuda( - unsafe { - cuvs_sys::cudaMemcpyAsync( - values.as_mut_ptr() as *mut _, - tensor.dl_tensor.data, - tensor_num_bytes(tensor), - cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, - resources - .get_cuda_stream() - .map_err(|e| Error::io(e.to_string()))?, - ) - }, - "copy tensor to host", - )?; - resources - .sync_stream() - .map_err(|e| Error::io(e.to_string()))?; - Ok((values, [shape[0], shape[1], shape[2]])) -} - -fn 
infer_dimension(dataset: &Dataset, column: &str) -> Result { - let field = dataset.schema().field(column).ok_or_else(|| { - Error::invalid_input(format!( - "column '{column}' does not exist in dataset schema" - )) - })?; - infer_vector_dim(&field.data_type()) -} - -fn matrix_from_vectors<'a>(vectors: &'a FixedSizeListArray) -> Result> { - let dim = vectors.value_length() as usize; - match vectors.value_type() { - DataType::Float32 => { - let values = vectors.values().as_primitive::(); - let values: &[f32] = values.values().as_ref(); - Ok(MatrixBuffer::Borrowed { - values, - rows: vectors.len(), - cols: dim, - }) - } - DataType::Float16 => { - let values = vectors.values().as_primitive::(); - let data = values - .values() - .iter() - .map(|value| value.to_f32()) - .collect::>(); - Ok(MatrixBuffer::Owned( - Array2::from_shape_vec((vectors.len(), dim), data).map_err(|error| { - Error::io(format!("failed to create float16 matrix copy: {error}")) - })?, - )) - } - DataType::Float64 => { - let values = vectors.values().as_primitive::(); - let data = values - .values() - .iter() - .map(|value| *value as f32) - .collect::>(); - Ok(MatrixBuffer::Owned( - Array2::from_shape_vec((vectors.len(), dim), data).map_err(|error| { - Error::io(format!("failed to create float64 matrix copy: {error}")) - })?, - )) - } - other => Err(Error::not_supported(format!( - "cuVS IVF_PQ currently supports float16/float32/float64 vectors, got {other}" - ))), - } -} - -fn ivf_centroids_from_host(array: Array2) -> Result { - let dim = array.ncols() as i32; - let values = Float32Array::from_iter_values(array.into_iter()); - Ok(FixedSizeListArray::try_new_from_values(values, dim)?) 
-} - -fn pq_codebook_from_host( - values: Vec, - shape: [usize; 3], - num_sub_vectors: usize, - dimension: usize, - num_bits: usize, -) -> Result { - let pq_book_size = 1usize << num_bits; - let subvector_dim = dimension / num_sub_vectors; - let expected = [num_sub_vectors, subvector_dim, pq_book_size]; - if shape != expected { - return Err(Error::io(format!( - "cuVS returned incompatible PQ codebook shape: expected {expected:?}, got {shape:?}" - ))); - } - - let mut flattened = Vec::with_capacity(values.len()); - for subspace in 0..num_sub_vectors { - for centroid in 0..pq_book_size { - for component in 0..subvector_dim { - let source_idx = ((subspace * subvector_dim + component) * pq_book_size) + centroid; - flattened.push(values[source_idx]); - } - } - } - - Ok(FixedSizeListArray::try_new_from_values( - Float32Array::from(flattened), - subvector_dim as i32, - )?) -} - -fn build_metadata_batch( - ivf_centroids: &FixedSizeListArray, - pq_codebook: &FixedSizeListArray, -) -> Result { - let ivf_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, ivf_centroids.len() as i32])); - let pq_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, pq_codebook.len() as i32])); - let ivf_list = ListArray::new( - Arc::new(Field::new( - "_ivf_centroids_item", - ivf_centroids.data_type().clone(), - false, - )), - ivf_offsets, - Arc::new(ivf_centroids.clone()), - None, - ); - let pq_list = ListArray::new( - Arc::new(Field::new( - "_pq_codebook_item", - pq_codebook.data_type().clone(), - false, - )), - pq_offsets, - Arc::new(pq_codebook.clone()), - None, - ); - let schema = Arc::new(ArrowSchema::new(vec![ - Field::new("_ivf_centroids", ivf_list.data_type().clone(), false), - Field::new("_pq_codebook", pq_list.data_type().clone(), false), - ])); - Ok(RecordBatch::try_new( - schema, - vec![Arc::new(ivf_list), Arc::new(pq_list)], - )?) 
-} - -fn metadata_writer_options() -> Result { - Ok(FileWriterOptions { - format_version: Some( - PARTITION_ARTIFACT_FILE_VERSION - .parse::() - .map_err(|error| { - Error::invalid_input(format!( - "invalid partition artifact file version '{}': {}", - PARTITION_ARTIFACT_FILE_VERSION, error - )) - })?, - ), - ..Default::default() - }) -} - -async fn write_partition_artifact_metadata( - artifact_uri: &str, - trained: &TrainedIvfPqIndex, -) -> Result<()> { - let (object_store, root_dir) = lance::io::ObjectStore::from_uri(artifact_uri) - .await - .map_err(|error| Error::io(error.to_string()))?; - let path = root_dir.child(PARTITION_ARTIFACT_METADATA_FILE_NAME); - let batch = build_metadata_batch(&trained.ivf_centroids, &trained.pq_codebook)?; - let mut writer = FileWriter::try_new( - object_store.create(&path).await?, - lance_core::datatypes::Schema::try_from(batch.schema().as_ref())?, - metadata_writer_options()?, - )?; - writer.add_schema_metadata( - "lance:index_build:artifact_version".to_string(), - "1".to_string(), - ); - writer.add_schema_metadata( - "lance:index_build:distance_type".to_string(), - trained.metric_type.to_string(), - ); - writer.add_schema_metadata( - "lance:index_build:num_partitions".to_string(), - trained.num_partitions.to_string(), - ); - writer.add_schema_metadata( - "lance:index_build:num_sub_vectors".to_string(), - trained.num_sub_vectors.to_string(), - ); - writer.add_schema_metadata( - "lance:index_build:num_bits".to_string(), - trained.num_bits.to_string(), - ); - writer.add_schema_metadata( - "lance:index_build:dimension".to_string(), - trained.dimension.to_string(), - ); - writer.write_batch(&batch).await?; - writer.finish().await?; - Ok(()) -} - -fn build_partition_batch( - row_ids: Arc, - partitions: &[u32], - pq_codes: &[u8], - code_width: usize, -) -> Result { - if pq_codes.len() != partitions.len() * code_width { - return Err(Error::io(format!( - "partition artifact batch expects {} PQ codes for {} rows and code width {}, got {}", 
- partitions.len() * code_width, - partitions.len(), - code_width, - pq_codes.len() - ))); - } - let schema = Arc::new(ArrowSchema::new(vec![ - Field::new(ROW_ID, DataType::UInt64, false), - Field::new(PART_ID_COLUMN, DataType::UInt32, false), - Field::new( - PQ_CODE_COLUMN, - DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::UInt8, true)), - code_width as i32, - ), - true, - ), - ])); - let pq_codes = FixedSizeListArray::try_new_from_values( - UInt8Array::from_iter_values(pq_codes.iter().copied()), - code_width as i32, - )?; - Ok(RecordBatch::try_new( - schema, - vec![ - row_ids, - Arc::new(UInt32Array::from_iter_values(partitions.iter().copied())), - Arc::new(pq_codes), - ], - )?) -} - -fn transform_batch_loss(batch: &RecordBatch) -> f64 { - batch - .metadata() - .get(LOSS_METADATA_KEY) - .and_then(|value| value.parse::().ok()) - .unwrap_or(0.0) -} - -struct TransformSlot { - input_host: PinnedHostBuffer, - input_device: DeviceTensor, - labels_host: PinnedHostBuffer, - labels_device: DeviceTensor, - codes_host: PinnedHostBuffer, - codes_device: DeviceTensor, - h2d_start: CudaEvent, - h2d_done: CudaEvent, - transform_done: CudaEvent, - output_ready: CudaEvent, - row_ids: Option>, - rows: usize, -} - -impl TransformSlot { - fn try_new( - resources: &Resources, - max_rows: usize, - dimension: usize, - code_width: usize, - ) -> Result { - Ok(Self { - input_host: PinnedHostBuffer::try_new(max_rows * dimension)?, - input_device: DeviceTensor::try_new(resources, &[max_rows, dimension])?, - labels_host: PinnedHostBuffer::try_new(max_rows)?, - labels_device: DeviceTensor::try_new(resources, &[max_rows])?, - codes_host: PinnedHostBuffer::try_new(max_rows * code_width)?, - codes_device: DeviceTensor::try_new(resources, &[max_rows, code_width])?, - h2d_start: CudaEvent::try_new()?, - h2d_done: CudaEvent::try_new()?, - transform_done: CudaEvent::try_new()?, - output_ready: CudaEvent::try_new()?, - row_ids: None, - rows: 0, - }) - } - - fn 
has_pending_output(&self) -> bool { - self.row_ids.is_some() - } - - fn launch( - &mut self, - trained: &TrainedIvfPqIndex, - stream: cuvs_sys::cudaStream_t, - row_ids: Arc, - matrix: &[f32], - rows: usize, - dimension: usize, - ) -> Result<()> { - let code_width = trained.pq_code_width(); - self.input_host.copy_from_slice(matrix)?; - self.input_device.set_shape(&[rows, dimension])?; - self.labels_device.set_shape(&[rows])?; - self.codes_device.set_shape(&[rows, code_width])?; - self.rows = rows; - self.row_ids = Some(row_ids); - - self.h2d_start.record(stream)?; - self.input_device.copy_from_host_async( - &trained.resources, - self.input_host.prefix(rows * dimension)?, - )?; - self.h2d_done.record(stream)?; - check_cuvs( - unsafe { - cuvs_sys::cuvsIvfPqTransform( - trained.resources.0, - trained.index.raw, - self.input_device.as_mut_ptr(), - self.labels_device.as_mut_ptr(), - self.codes_device.as_mut_ptr(), - ) - }, - "transform vectors with IVF_PQ", - )?; - self.transform_done.record(stream)?; - self.labels_device - .copy_to_host_async(&trained.resources, self.labels_host.prefix_mut(rows)?)?; - self.codes_device.copy_to_host_async( - &trained.resources, - self.codes_host.prefix_mut(rows * code_width)?, - )?; - self.output_ready.record(stream)?; - Ok(()) - } - - fn drain_to_batch(&mut self, code_width: usize) -> Result> { - if !self.has_pending_output() { - return Ok(None); - } - - self.output_ready.synchronize()?; - let row_ids = self - .row_ids - .take() - .ok_or_else(|| Error::io("transform slot is missing row ids"))?; - let batch = build_partition_batch( - row_ids, - self.labels_host.prefix(self.rows)?, - self.codes_host.prefix(self.rows * code_width)?, - code_width, - )?; - self.rows = 0; - Ok(Some(batch)) - } -} - -async fn for_each_transformed_batch( - dataset: &Dataset, - column: &str, - trained: &TrainedIvfPqIndex, - batch_size: usize, - filter_nan: bool, - mut on_batch: F, -) -> Result<()> -where - F: FnMut(RecordBatch) -> Fut, - Fut: 
std::future::Future>, -{ - let code_width = trained.pq_code_width(); - let mut scanner = dataset.scan(); - scanner.project(&[column])?; - if dataset - .schema() - .field(column) - .is_some_and(|field| field.nullable && filter_nan) - { - scanner.filter(&format!("{column} is not null"))?; - } - scanner.with_row_id(); - scanner.batch_size(batch_size); - let mut stream = scanner.try_into_stream().await?; - let cuda_stream = trained - .resources - .get_cuda_stream() - .map_err(|error| Error::io(error.to_string()))?; - let mut slots = (0..PIPELINE_SLOTS) - .map(|_| { - TransformSlot::try_new( - &trained.resources, - batch_size, - trained.dimension, - code_width, - ) - }) - .collect::>>()?; - let mut next_slot = 0usize; - - loop { - let Some(batch) = stream.try_next().await? else { - break; - }; - let slot = &mut slots[next_slot]; - if let Some(transformed) = slot.drain_to_batch(code_width)? { - on_batch(transformed).await?; - } - - let vectors = vector_column_to_fsl(&batch, column)?; - let row_ids = batch - .column_by_name(ROW_ID) - .ok_or_else(|| Error::invalid_input(format!("transform batch is missing {ROW_ID}")))?; - let finite_mask = is_finite(&vectors); - let valid_rows = finite_mask.true_count(); - if valid_rows == 0 { - continue; - } - if valid_rows != vectors.len() { - warn!( - "{} vectors are ignored during partition assignment because they are null or non-finite", - vectors.len() - valid_rows - ); - } - - let filtered_row_ids = if valid_rows == row_ids.len() { - row_ids.clone() - } else { - filter(row_ids.as_ref(), &finite_mask)? - }; - let filtered_vectors = if valid_rows == vectors.len() { - vectors - } else { - let vector_column = batch.column_by_name(column).ok_or_else(|| { - Error::invalid_input(format!( - "transform batch is missing vector column '{column}'" - )) - })?; - let field = batch - .schema() - .field_with_name(column) - .map_err(|_| { - Error::invalid_input(format!( - "transform batch schema is missing field '{column}'" - )) - })? 
- .clone(); - let filtered_vectors = filter(vector_column.as_ref(), &finite_mask)?; - vector_column_to_fsl( - &RecordBatch::try_new( - Arc::new(ArrowSchema::new(vec![field])), - vec![filtered_vectors], - )?, - column, - )? - }; - - let matrix = matrix_from_vectors(&filtered_vectors)?; - let matrix_view = matrix.view()?; - let input_slice = matrix_view - .as_slice_memory_order() - .ok_or_else(|| Error::io("transform matrix is not contiguous"))?; - - slot.launch( - trained, - cuda_stream, - filtered_row_ids, - input_slice, - matrix.rows(), - matrix_view.ncols(), - )?; - next_slot = (next_slot + 1) % PIPELINE_SLOTS; - } - - for slot in &mut slots { - if let Some(transformed) = slot.drain_to_batch(code_width)? { - on_batch(transformed).await?; - } - } - Ok(()) -} - -pub async fn train_ivf_pq( - dataset: &Dataset, - column: &str, - num_partitions: usize, - metric_type: DistanceType, - num_sub_vectors: usize, - sample_rate: usize, - max_iters: usize, - num_bits: usize, - filter_nan: bool, -) -> Result { - if num_bits != 8 { - return Err(Error::not_supported( - "cuVS IVF_PQ currently supports only num_bits=8", - )); - } - - let dimension = infer_dimension(dataset, column)?; - if dimension % num_sub_vectors != 0 { - return Err(Error::invalid_input(format!( - "cuVS IVF_PQ requires vector dimension {} to be divisible by num_sub_vectors {}", - dimension, num_sub_vectors - ))); - } - - let num_rows = dataset.count_rows(None).await?; - if num_rows == 0 { - return Err(Error::invalid_input( - "cuVS training requires at least one training vector", - )); - } - let train_rows = num_rows - .min((num_partitions * sample_rate).max(256 * 256)) - .max(1); - let train_vectors = if filter_nan { - let batch = dataset.scan().project(&[column])?.try_into_batch().await?; - let vectors = vector_column_to_fsl(&batch, column)?; - let mask = is_finite(&vectors); - let filtered = filter(&vectors, &mask)?.as_fixed_size_list().clone(); - filtered.slice(0, train_rows.min(filtered.len())) - } else { - 
let projection = dataset.schema().project(&[column])?; - let batch = dataset.sample(train_rows, &projection, None).await?; - vector_column_to_fsl(&batch, column)? - }; - if train_vectors.is_empty() { - return Err(Error::invalid_input( - "cuVS training requires at least one non-null training vector", - )); - } - - let matrix = matrix_from_vectors(&train_vectors)?; - let resources = Resources::new().map_err(|error| Error::io(error.to_string()))?; - let index = CuvsIvfPqIndex::try_new()?; - let params = create_index_params( - metric_type, - num_partitions, - num_sub_vectors, - sample_rate, - max_iters, - num_bits, - )?; - let matrix_view = matrix.view()?; - let mut dataset_tensor = HostTensorView::try_new::( - &[matrix_view.nrows(), matrix_view.ncols()], - matrix_view.as_ptr() as *mut std::ffi::c_void, - ); - - let build_result = check_cuvs( - unsafe { - cuvs_sys::cuvsIvfPqBuild(resources.0, params, dataset_tensor.as_mut_ptr(), index.raw) - }, - "build IVF_PQ index", - ); - destroy_index_params(params); - build_result?; - - let mut centers = make_tensor_view(); - check_cuvs( - unsafe { cuvs_sys::cuvsIvfPqIndexGetCenters(index.raw, centers.as_mut_ptr()) }, - "get IVF centroids", - )?; - let ivf_centroids = - ivf_centroids_from_host(copy_tensor_to_host_f32_2d(&resources, ¢ers.tensor)?)?; - - let mut pq_centers = make_tensor_view(); - check_cuvs( - unsafe { cuvs_sys::cuvsIvfPqIndexGetPqCenters(index.raw, pq_centers.as_mut_ptr()) }, - "get PQ codebook", - )?; - let (pq_codebook_values, pq_codebook_shape) = - copy_tensor_to_host_f32_3d(&resources, &pq_centers.tensor)?; - let pq_codebook = pq_codebook_from_host( - pq_codebook_values, - pq_codebook_shape, - num_sub_vectors, - dimension, - num_bits, - )?; - - Ok(TrainedIvfPqIndex { - resources, - index, - num_partitions, - dimension, - num_sub_vectors, - num_bits, - metric_type, - ivf_centroids, - pq_codebook, - }) -} - -pub async fn assign_ivf_pq_to_artifact( - dataset: &Dataset, - column: &str, - trained: 
&TrainedIvfPqIndex, - artifact_uri: &str, - batch_size: usize, - filter_nan: bool, -) -> Result> { - let code_width = trained.pq_code_width(); - let builder = Arc::new(tokio::sync::Mutex::new( - PartitionArtifactBuilder::try_new(artifact_uri, trained.num_partitions, code_width, None) - .await?, - )); - for_each_transformed_batch(dataset, column, trained, batch_size, filter_nan, |batch| { - let builder = builder.clone(); - async move { - builder.lock().await.append_batch(&batch).await?; - Ok(()) - } - }) - .await?; - let mut builder = Arc::try_unwrap(builder) - .map_err(|_| Error::io("partition artifact builder still has outstanding references"))? - .into_inner(); - - write_partition_artifact_metadata(artifact_uri, trained).await?; - let mut files = builder.finish(PARTITION_ARTIFACT_METADATA_FILE_NAME, None).await?; - if files.len() > 1 { - files.insert(1, PARTITION_ARTIFACT_METADATA_FILE_NAME.to_string()); - } - Ok(files) -} From 578f789526cb6ff14ebc431ec50930053a90964d Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 8 Apr 2026 19:52:27 +0800 Subject: [PATCH 16/21] refactor: remove in-tree cuvs integration --- AGENTS.md | 6 - python/DEVELOPMENT.md | 16 - python/python/lance/cuvs.py | 637 ------------------ python/python/lance/dataset.py | 189 ++---- python/python/lance/indices/builder.py | 33 +- python/python/lance/lance/__init__.pyi | 14 - python/python/tests/test_vector_index.py | 368 +--------- python/src/dataset.rs | 4 - python/src/file.rs | 77 --- python/src/lib.rs | 4 +- rust/lance-index/src/vector/ivf/builder.rs | 6 - rust/lance/src/index/vector.rs | 2 - rust/lance/src/index/vector/builder.rs | 27 - .../lance/src/index/vector/encoded_dataset.rs | 370 ---------- rust/lance/src/index/vector/ivf.rs | 35 - 15 files changed, 106 insertions(+), 1682 deletions(-) delete mode 100644 python/python/lance/cuvs.py delete mode 100644 rust/lance/src/index/vector/encoded_dataset.rs diff --git a/AGENTS.md b/AGENTS.md index ec2b3e21773..8543d23521a 100644 --- a/AGENTS.md 
+++ b/AGENTS.md @@ -53,12 +53,6 @@ cd test_data && docker compose up -d AWS_DEFAULT_REGION=us-east-1 pytest --run-integration python/tests/test_s3_ddb.py ``` -### Benchmarking Discipline - -- Benchmark machines must use release builds only. For Python bindings, always run `maturin develop --release` before collecting any timing data. -- Never use `maturin develop` without `--release` on a benchmark host. If a dev-profile rebuild is needed for functional debugging, use a different machine or clearly discard all performance results collected afterwards. -- Before trusting a benchmark result, verify the mounted benchmark volume and the active build profile. - ## Coding Standards ### General diff --git a/python/DEVELOPMENT.md b/python/DEVELOPMENT.md index 21dba0bdddd..12c56549608 100644 --- a/python/DEVELOPMENT.md +++ b/python/DEVELOPMENT.md @@ -8,22 +8,6 @@ uv sync --extra tests --extra dev Add extras such as `benchmarks`, `torch`, or `geo` only when you need them. After the environment is initialized, either activate it or use `uv run ...` for commands. -`accelerator="cuvs"` does not have a normal project extra today. cuVS Python -packages are published per CUDA major version and are typically installed from -NVIDIA's package index, for example: - -```shell -uv pip install --extra-index-url https://pypi.nvidia.com cuvs-cu12 -``` - -or: - -```shell -uv pip install --extra-index-url https://pypi.nvidia.com cuvs-cu13 -``` - -Pick the package that matches the CUDA version in your environment. - `uv sync` is not just downloading Python packages here. It also builds the local `pylance` Rust extension as part of the editable environment, so the first run, cache misses, or Rust dependency changes can make it noticeably slow. This is expected; let the build finish instead of interrupting it and switching to a different environment setup. 
## Building the project diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py deleted file mode 100644 index c32dc12b55c..00000000000 --- a/python/python/lance/cuvs.py +++ /dev/null @@ -1,637 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright The Lance Authors - -from __future__ import annotations - -import json -import re -import tempfile -from importlib import import_module -from typing import TYPE_CHECKING, Iterator, Tuple - -import pyarrow as pa -import pyarrow.compute as pc - -from .file import LanceFileSession -from .lance import PartitionArtifactBuilder -from .dependencies import numpy as np -from .log import LOGGER -from .util import _normalize_metric_type - -if TYPE_CHECKING: - from pathlib import Path - -PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY = ( - "lance:index_build:precomputed_encoded_partition_sizes" -) -PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY = ( - "lance:index_build:precomputed_encoded_partition_fragment_ids" -) -PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY = ( - "lance:index_build:precomputed_encoded_total_loss" -) - -PARTITION_ARTIFACT_MANIFEST_VERSION = 1 -PARTITION_ARTIFACT_MANIFEST_FILE_NAME = "manifest.json" -PARTITION_ARTIFACT_METADATA_FILE_NAME = "metadata.lance" -PARTITION_ARTIFACT_PARTITIONS_DIR = "partitions" -DEFAULT_PARTITION_ARTIFACT_BUCKETS = 256 -PARTITION_ARTIFACT_ROW_ID_COLUMN = "_rowid" - -def build_vector_index_on_cuvs( - dataset, - column: str, - metric_type: str, - accelerator: str, - num_partitions: int, - num_sub_vectors: int, - dst_dataset_uri: str | Path | None = None, - *, - sample_rate: int = 256, - max_iters: int = 50, - num_bits: int = 8, - batch_size: int = 1024 * 128, - filter_nan: bool = True, -): - if dst_dataset_uri is None: - dst_dataset_uri = tempfile.mkdtemp() - - trained_index, ivf_centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( - dataset, - column, - num_partitions, - metric_type, - accelerator, - num_sub_vectors=num_sub_vectors, - 
sample_rate=sample_rate, - max_iters=max_iters, - num_bits=num_bits, - filter_nan=filter_nan, - ) - artifact_root, artifact_files = one_pass_assign_ivf_pq_on_cuvs( - dataset, - column, - metric_type, - accelerator, - ivf_centroids, - pq_codebook, - trained_index=trained_index, - dst_dataset_uri=dst_dataset_uri, - batch_size=batch_size, - filter_nan=filter_nan, - ) - return artifact_root, artifact_files, ivf_centroids, pq_codebook - - -def is_cuvs_accelerator(accelerator: object) -> bool: - return accelerator == "cuvs" - - -def _require_cuvs(): - try: - return import_module("cuvs.neighbors.ivf_pq") - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "accelerator='cuvs' requires cuVS Python bindings to be installed. " - "Install a CUDA-matched package such as 'cuvs-cu12' or 'cuvs-cu13' " - "from https://pypi.nvidia.com." - ) from exc - - -def _optional_cupy(): - try: - return import_module("cupy") - except ModuleNotFoundError: - return None - - -def _make_progress(total: int): - try: - from tqdm.auto import tqdm - - return tqdm(total=total) - except ModuleNotFoundError: - - class _NoOpProgress: - def set_description(self, _description: str): - return None - - def update(self, _count: int): - return None - - def close(self): - return None - - return _NoOpProgress() - - -def _metric_to_cuvs(metric_type: str) -> str: - metric_type = _normalize_metric_type(metric_type).lower() - if metric_type in {"l2", "euclidean"}: - return "sqeuclidean" - if metric_type == "dot": - return "inner_product" - if metric_type == "cosine": - return "cosine" - raise ValueError(f"Metric '{metric_type}' is not supported by cuVS IVF_PQ") - - -def _coerce_float_matrix(matrix: np.ndarray, *, column: str) -> np.ndarray: - if matrix.ndim != 2: - raise ValueError( - f"Expected a 2D training matrix for column '{column}', got {matrix.shape}" - ) - if matrix.dtype == np.float64: - matrix = matrix.astype(np.float32) - elif matrix.dtype not in (np.float16, np.float32): - matrix = 
matrix.astype(np.float32) - return matrix - - -def _column_to_numpy(table: pa.Table | pa.RecordBatch, column: str) -> np.ndarray: - array = table.column(column) - if isinstance(array, pa.ChunkedArray): - array = array.combine_chunks() - if len(array) == 0: - raise ValueError("cuVS training requires at least one training vector") - - if pa.types.is_fixed_size_list(array.type): - values = array.values.to_numpy(zero_copy_only=False) - matrix = values.reshape(len(array), array.type.list_size) - return _coerce_float_matrix(matrix, column=column) - - values = array.to_pylist() - return _coerce_float_matrix(np.asarray(values), column=column) - - -def _annotate_precomputed_encoded_dataset( - dataset, - partition_sizes: list[int], - *, - total_loss: float | None = None, -) -> None: - partition_fragments = [[] for _ in range(len(partition_sizes))] - for fragment in dataset.get_fragments(): - fragment_partitions = set() - scanner = fragment.scanner(columns=["__ivf_part_id"]) - for batch in scanner.to_batches(): - fragment_partitions.update( - int(partition_id) - for partition_id in np.unique( - batch.column("__ivf_part_id").to_numpy(zero_copy_only=False) - ) - ) - for partition_id in fragment_partitions: - partition_fragments[partition_id].append(int(fragment.metadata.id)) - - metadata = { - PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY: json.dumps( - [int(size) for size in partition_sizes] - ), - PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY: json.dumps( - partition_fragments - ), - } - if total_loss is not None: - metadata[PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY] = json.dumps( - float(total_loss) - ) - dataset.update_metadata(metadata) - - -def _as_numpy(array_like) -> np.ndarray: - if isinstance(array_like, np.ndarray): - return array_like - - if hasattr(array_like, "copy_to_host"): - return np.asarray(array_like.copy_to_host()) - - try: - array = np.asarray(array_like) - if isinstance(array, np.ndarray): - return array - except Exception: - pass - - if 
hasattr(array_like, "get"): - return np.asarray(array_like.get()) - - cupy = _optional_cupy() - if cupy is not None: - return cupy.asnumpy(array_like) - - raise TypeError("Unable to convert cuVS output to numpy") - - -def _normalize_artifact_root(path_or_uri: str | Path) -> str: - root = str(path_or_uri) - if re.search(r".:\\", root) is not None: - root = root.replace("\\", "/", 1) - return root - - -def _make_metadata_table( - ivf_centroids: np.ndarray, - pq_codebook: np.ndarray, -) -> pa.Table: - dimension = ivf_centroids.shape[1] - subvector_dim = pq_codebook.shape[2] - ivf_type = pa.list_(pa.list_(pa.float32(), dimension)) - pq_type = pa.list_(pa.list_(pa.float32(), subvector_dim)) - ivf_values = pa.array([ivf_centroids.tolist()], type=ivf_type) - pq_values = pa.array( - [pq_codebook.reshape(-1, subvector_dim).tolist()], - type=pq_type, - ) - return pa.Table.from_arrays( - [ivf_values, pq_values], - names=["_ivf_centroids", "_pq_codebook"], - ) - - -def _write_partition_artifact_metadata( - session: LanceFileSession, - *, - ivf_centroids: np.ndarray, - pq_codebook: np.ndarray, - metric_type: str, - num_bits: int, -) -> None: - metadata_table = _make_metadata_table(ivf_centroids, pq_codebook) - with session.open_writer( - PARTITION_ARTIFACT_METADATA_FILE_NAME, - schema=metadata_table.schema, - version="2.2", - ) as writer: - writer.add_schema_metadata("lance:index_build:artifact_version", "1") - writer.add_schema_metadata( - "lance:index_build:distance_type", _normalize_metric_type(metric_type) - ) - writer.add_schema_metadata( - "lance:index_build:num_partitions", str(ivf_centroids.shape[0]) - ) - writer.add_schema_metadata( - "lance:index_build:num_sub_vectors", str(pq_codebook.shape[0]) - ) - writer.add_schema_metadata("lance:index_build:num_bits", str(num_bits)) - writer.add_schema_metadata("lance:index_build:dimension", str(ivf_centroids.shape[1])) - writer.write_batch(metadata_table) - - -def _write_partition_artifact( - batches: Iterator[pa.RecordBatch], 
- *, - artifact_root: str | Path, - ivf_centroids: np.ndarray, - pq_codebook: np.ndarray, - metric_type: str, - num_bits: int, - num_partitions: int, - total_loss: float | None = None, -) -> tuple[str, list[str]]: - artifact_root = _normalize_artifact_root(artifact_root) - session = LanceFileSession(artifact_root) - builder = PartitionArtifactBuilder( - artifact_root, - num_partitions=num_partitions, - pq_code_width=pq_codebook.shape[0], - ) - for batch in batches: - builder.append_batch(batch) - - _write_partition_artifact_metadata( - session, - ivf_centroids=ivf_centroids, - pq_codebook=pq_codebook, - metric_type=metric_type, - num_bits=num_bits, - ) - artifact_files = builder.finish( - PARTITION_ARTIFACT_METADATA_FILE_NAME, - float(total_loss) if total_loss is not None else None, - ) - artifact_files.insert(1, PARTITION_ARTIFACT_METADATA_FILE_NAME) - return artifact_root, artifact_files - - -def _to_cuvs_transform_input(matrix: np.ndarray): - cupy = _optional_cupy() - if cupy is None: - raise ModuleNotFoundError( - "accelerator='cuvs' full index build requires the 'cupy' package " - "to pass transform batches in device memory" - ) - return cupy.asarray(matrix) - - -def _normalize_centroids(index, num_partitions: int, dimension: int) -> np.ndarray: - centroids = _as_numpy(index.centers) - if centroids.shape != (num_partitions, dimension): - raise ValueError( - "cuVS returned incompatible IVF centroids shape: " - f"expected {(num_partitions, dimension)}, got {centroids.shape}" - ) - return centroids - - -def _normalize_pq_codebook( - index, num_sub_vectors: int, num_bits: int, dimension: int -) -> np.ndarray: - pq_book_size = 1 << num_bits - subvector_dim = dimension // num_sub_vectors - pq_centers = _as_numpy(index.pq_centers) - - expected_shapes = { - (subvector_dim, num_sub_vectors, pq_book_size): (1, 2, 0), - (num_sub_vectors, subvector_dim, pq_book_size): (0, 2, 1), - (num_sub_vectors, pq_book_size, subvector_dim): None, - } - transpose = 
expected_shapes.get(pq_centers.shape) - if transpose is None and pq_centers.shape not in expected_shapes: - raise ValueError( - "cuVS returned incompatible PQ codebook shape: expected one of " - f"{list(expected_shapes.keys())}, got {pq_centers.shape}" - ) - if transpose is not None: - pq_centers = np.transpose(pq_centers, transpose) - return pq_centers - - -def _estimate_trainset_fraction( - num_rows: int, num_partitions: int, sample_rate: int -) -> float: - if num_rows <= 0: - raise ValueError("cuVS training requires a non-empty dataset") - desired_rows = max(num_partitions * sample_rate, 256 * 256) - return min(1.0, desired_rows / num_rows) - - -def _sample_training_table( - dataset, column: str, train_rows: int, filt: str | None -) -> pa.Table: - if filt is None: - return dataset.sample(train_rows, columns=[column], randomize_order=True) - - total_rows = dataset.count_rows() - sample_rows = min(total_rows, max(train_rows * 2, train_rows + 1024)) - trainset = dataset.sample(sample_rows, columns=[column], randomize_order=True) - trainset = trainset.filter(pc.is_valid(trainset.column(column))) - if len(trainset) >= train_rows or sample_rows == total_rows: - return trainset.slice(0, min(train_rows, len(trainset))) - - return dataset.to_table(columns=[column], filter=filt, limit=train_rows) - - -def _train_ivf_pq_index_on_cuvs( - dataset, - column: str, - num_partitions: int, - metric_type: str, - accelerator: str, - num_sub_vectors: int, - *, - sample_rate: int = 256, - max_iters: int = 50, - num_bits: int = 8, - filter_nan: bool = True, -): - if accelerator != "cuvs": - raise ValueError("cuVS acceleration only supports accelerator='cuvs'") - if num_bits != 8: - raise ValueError("cuVS IVF_PQ integration currently supports only num_bits=8") - - dimension = dataset.schema.field(column).type.list_size - if dimension % num_sub_vectors != 0: - raise ValueError( - "cuVS IVF_PQ integration requires vector dimension to be divisible by " - "num_sub_vectors" - ) - - if 
dataset.schema.field(column).nullable and filter_nan: - filt = f"{column} is not null" - else: - filt = None - - num_rows = dataset.count_rows(filter=filt) - if num_rows == 0: - raise ValueError("cuVS training requires at least one non-null training vector") - - train_rows = max(1, min(num_rows, max(num_partitions * sample_rate, 256 * 256))) - trainset = _sample_training_table(dataset, column, train_rows, filt) - matrix = _column_to_numpy(trainset, column) - - ivf_pq = _require_cuvs() - build_params = ivf_pq.IndexParams( - n_lists=num_partitions, - metric=_metric_to_cuvs(metric_type), - kmeans_n_iters=max_iters, - kmeans_trainset_fraction=_estimate_trainset_fraction( - matrix.shape[0], num_partitions, sample_rate - ), - pq_bits=num_bits, - pq_dim=num_sub_vectors, - codebook_kind="subspace", - force_random_rotation=False, - add_data_on_build=False, - ) - - index = ivf_pq.build(build_params, matrix) - centroids = _normalize_centroids(index, num_partitions, dimension) - pq_codebook = _normalize_pq_codebook(index, num_sub_vectors, num_bits, dimension) - return index, centroids, pq_codebook - - -def one_pass_assign_ivf_pq_on_cuvs( - dataset, - column: str, - metric_type: str, - accelerator: str, - ivf_centroids: np.ndarray, - pq_codebook: np.ndarray, - trained_index=None, - dst_dataset_uri: str | Path | None = None, - batch_size: int = 1024 * 128, - *, - filter_nan: bool = True, -): - if accelerator != "cuvs": - raise ValueError("cuVS acceleration only supports accelerator='cuvs'") - - num_rows = dataset.count_rows() - if dataset.schema.field(column).nullable and filter_nan: - filt = f"{column} is not null" - else: - filt = None - - num_sub_vectors = pq_codebook.shape[0] - ivf_pq = _require_cuvs() - - if trained_index is None: - raise ValueError( - "one_pass_assign_ivf_pq_on_cuvs requires a trained cuVS index for " - "single-node transform" - ) - transform_code_width = (trained_index.pq_dim * trained_index.pq_bits + 7) // 8 - if transform_code_width != num_sub_vectors: 
- raise ValueError( - "cuVS transform output is incompatible with Lance IVF_PQ for this " - "configuration: expected " - f"{num_sub_vectors} PQ code columns, but cuVS will produce " - f"{transform_code_width}. Use a configuration where " - "ceil(pq_dim * pq_bits / 8) == num_sub_vectors." - ) - - progress = _make_progress(num_rows) - progress.set_description("Assigning partitions and computing pq codes") - num_partitions = ivf_centroids.shape[0] - partition_sizes = np.zeros(num_partitions, dtype=np.int64) - - output_schema = pa.schema( - [ - pa.field(PARTITION_ARTIFACT_ROW_ID_COLUMN, pa.uint64()), - pa.field("__ivf_part_id", pa.uint32()), - pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), - ] - ) - - def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: - for batch in dataset.to_batches( - columns=[column], - filter=filt, - with_row_id=True, - batch_size=batch_size, - ): - vectors = _column_to_numpy(batch, column) - row_ids = batch.column("_rowid").to_numpy() - valid_mask = np.isfinite(vectors).all(axis=1) - if not np.all(valid_mask): - LOGGER.warning( - "%s vectors are ignored during partition assignment", - len(valid_mask) - int(valid_mask.sum()), - ) - row_ids = row_ids[valid_mask] - vectors = vectors[valid_mask] - if len(row_ids) == 0: - continue - partitions, pq_codes = ivf_pq.transform( - trained_index, _to_cuvs_transform_input(vectors) - ) - partitions = _as_numpy(partitions).astype(np.uint32, copy=False) - partition_sizes[:] += np.bincount(partitions, minlength=num_partitions) - pq_codes = _as_numpy(pq_codes).astype(np.uint8, copy=False) - if pq_codes.shape != (len(row_ids), num_sub_vectors): - raise ValueError( - "cuVS transform returned incompatible PQ codes shape: " - f"expected {(len(row_ids), num_sub_vectors)}, got {pq_codes.shape}" - ) - - pq_values = pa.array(pq_codes.reshape(-1), type=pa.uint8()) - pq_code_array = pa.FixedSizeListArray.from_arrays( - pq_values, num_sub_vectors - ) - yield 
pa.RecordBatch.from_arrays( - [ - pa.array(row_ids, type=pa.uint64()), - pa.array(partitions, type=pa.uint32()), - pq_code_array, - ], - schema=output_schema, - ) - progress.update(len(row_ids)) - - if dst_dataset_uri is None: - dst_dataset_uri = tempfile.mkdtemp() - artifact_root, artifact_files = _write_partition_artifact( - _partition_and_pq_codes_assignment(), - artifact_root=dst_dataset_uri, - ivf_centroids=ivf_centroids, - pq_codebook=pq_codebook, - metric_type=metric_type, - num_bits=8, - num_partitions=num_partitions, - ) - - progress.close() - LOGGER.info("Saved precomputed partition artifact to %s", artifact_root) - return str(artifact_root), artifact_files - - -def train_ivf_pq_on_cuvs( - dataset, - column: str, - num_partitions: int, - metric_type: str, - accelerator: str, - num_sub_vectors: int, - *, - sample_rate: int = 256, - max_iters: int = 50, - num_bits: int = 8, - filter_nan: bool = True, -) -> Tuple[np.ndarray, np.ndarray]: - _, centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( - dataset, - column, - num_partitions, - metric_type, - accelerator, - num_sub_vectors, - sample_rate=sample_rate, - max_iters=max_iters, - num_bits=num_bits, - filter_nan=filter_nan, - ) - return centroids, pq_codebook - - -def one_pass_train_ivf_pq_on_cuvs( - dataset, - column: str, - num_partitions: int, - metric_type: str, - accelerator: str, - num_sub_vectors: int, - *, - sample_rate: int = 256, - max_iters: int = 50, - num_bits: int = 8, - filter_nan: bool = True, -): - return train_ivf_pq_on_cuvs( - dataset, - column, - num_partitions, - metric_type, - accelerator, - num_sub_vectors, - sample_rate=sample_rate, - max_iters=max_iters, - num_bits=num_bits, - filter_nan=filter_nan, - ) - - -def prepare_global_ivf_pq_on_cuvs( - dataset, - column: str, - num_partitions: int, - num_sub_vectors: int, - *, - distance_type: str = "l2", - accelerator: str = "cuvs", - sample_rate: int = 256, - max_iters: int = 50, - num_bits: int = 8, -): - centroids, pq_codebook = 
train_ivf_pq_on_cuvs( - dataset, - column, - num_partitions, - distance_type, - accelerator, - num_sub_vectors, - sample_rate=sample_rate, - max_iters=max_iters, - num_bits=num_bits, - ) - return {"ivf_centroids": centroids, "pq_codebook": pq_codebook} diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index ee023048a1e..32e9e548d68 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -39,7 +39,6 @@ from lance.log import LOGGER from .blob import BlobFile -from .cuvs import is_cuvs_accelerator from .dependencies import ( _check_for_numpy, _check_for_torch, @@ -2900,24 +2899,27 @@ def _create_index_impl( # Handle timing for various parts of accelerated builds timers = {} - use_cuvs = is_cuvs_accelerator(accelerator) + if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): + raise ValueError( + "accelerator='cuvs' is not built into Lance. " + "Use the external 'lance-cuvs' package to produce a " + "precomputed partition artifact and then call create_index " + "with precomputed_partition_artifact_uri." + ) if accelerator is not None and index_type != "IVF_PQ": - if use_cuvs: - raise ValueError( - f"accelerator='{accelerator}' only supports IVF_PQ index builds" - ) LOGGER.warning( "Index type %s does not support GPU acceleration; falling back to CPU", index_type, ) accelerator = None - use_cuvs = False # IMPORTANT: Distributed indexing is CPU-only. Enforce single-node when - # any Python-side accelerator path is selected. - accelerated_build_detected = accelerator is not None + # accelerator or torch-related paths are detected. 
+ torch_detected = False try: - if accelerator is None: + if accelerator is not None: + torch_detected = True + else: impl = kwargs.get("implementation") use_torch_flag = kwargs.get("use_torch") is True one_pass_flag = kwargs.get("one_pass_ivfpq") is True @@ -2930,16 +2932,16 @@ def _create_index_impl( or torch_centroids or torch_codebook ): - accelerated_build_detected = True + torch_detected = True except Exception: # Be conservative: if detection fails, do not modify behavior pass - if accelerated_build_detected: + if torch_detected: if require_commit: if fragment_ids is not None or index_uuid is not None: LOGGER.info( - "Accelerated build detected; " + "Torch detected; " "enforce single-node indexing (distributed is CPU-only)." ) fragment_ids = None @@ -2947,7 +2949,7 @@ def _create_index_impl( else: if index_uuid is not None: LOGGER.info( - "Accelerated build detected; " + "Torch detected; " "enforce single-node indexing (distributed is CPU-only)." ) index_uuid = None @@ -2958,83 +2960,52 @@ def _create_index_impl( num_partitions = _target_partition_size_to_num_partitions( num_rows, target_partition_size ) + from .vector import ( + one_pass_assign_ivf_pq_on_accelerator, + one_pass_train_ivf_pq_on_accelerator, + ) - if use_cuvs: - from .cuvs import build_vector_index_on_cuvs - - LOGGER.info("Doing cuVS vector backend build") - timers["ivf+pq_build:start"] = time.time() - artifact_root, _, ivf_centroids, pq_codebook = build_vector_index_on_cuvs( - self, - column[0], - metric, - accelerator, - num_partitions, - num_sub_vectors, - sample_rate=kwargs.get("sample_rate", 256), - max_iters=kwargs.get("max_iters", 50), - num_bits=kwargs.get("num_bits", 8), - batch_size=1024 * 128, - filter_nan=filter_nan, - ) - kwargs["precomputed_partition_artifact_uri"] = artifact_root - timers["ivf+pq_build:end"] = time.time() - ivfpq_build_time = ( - timers["ivf+pq_build:end"] - timers["ivf+pq_build:start"] - ) - LOGGER.info("cuVS ivf+pq build time: %ss", ivfpq_build_time) - else: 
- from .vector import ( - one_pass_assign_ivf_pq_on_accelerator, - one_pass_train_ivf_pq_on_accelerator, - ) - - LOGGER.info("Doing one-pass ivfpq accelerated computations") - timers["ivf+pq_train:start"] = time.time() - ( - ivf_centroids, - ivf_kmeans, - pq_codebook, - pq_kmeans_list, - ) = one_pass_train_ivf_pq_on_accelerator( - self, - column[0], - num_partitions, - metric, - accelerator, - num_sub_vectors=num_sub_vectors, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_train:end"] = time.time() - ivfpq_train_time = ( - timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] - ) - LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) - timers["ivf+pq_assign:start"] = time.time() - ( - shuffle_output_dir, - shuffle_buffers, - ) = one_pass_assign_ivf_pq_on_accelerator( - self, - column[0], - metric, - accelerator, - ivf_kmeans, - pq_kmeans_list, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_assign:end"] = time.time() - ivfpq_assign_time = ( - timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] - ) - LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) + LOGGER.info("Doing one-pass ivfpq accelerated computations") + timers["ivf+pq_train:start"] = time.time() + ( + ivf_centroids, + ivf_kmeans, + pq_codebook, + pq_kmeans_list, + ) = one_pass_train_ivf_pq_on_accelerator( + self, + column[0], + num_partitions, + metric, + accelerator, + num_sub_vectors=num_sub_vectors, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_train:end"] = time.time() + ivfpq_train_time = timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] + LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) + timers["ivf+pq_assign:start"] = time.time() + shuffle_output_dir, shuffle_buffers = one_pass_assign_ivf_pq_on_accelerator( + self, + column[0], + metric, + accelerator, + ivf_kmeans, + pq_kmeans_list, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_assign:end"] = time.time() + ivfpq_assign_time = 
( + timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] + ) + LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) - kwargs["precomputed_shuffle_buffers"] = shuffle_buffers - kwargs["precomputed_shuffle_buffers_path"] = os.path.join( - shuffle_output_dir, "data" - ) + kwargs["precomputed_shuffle_buffers"] = shuffle_buffers + kwargs["precomputed_shuffle_buffers_path"] = os.path.join( + shuffle_output_dir, "data" + ) if index_type.startswith("IVF"): if (ivf_centroids is not None) and (ivf_centroids_file is not None): raise ValueError( @@ -3190,13 +3161,6 @@ def _create_index_impl( "Temporary shuffle buffers stored at %s, you may want to delete it.", kwargs["precomputed_shuffle_buffers_path"], ) - if "precomputed_encoded_dataset_uri" in kwargs.keys() and os.path.exists( - kwargs["precomputed_encoded_dataset_uri"] - ): - LOGGER.info( - "Temporary precomputed encoded dataset stored at %s, you may want to delete it.", - kwargs["precomputed_encoded_dataset_uri"], - ) if "precomputed_partition_artifact_uri" in kwargs.keys() and os.path.exists( kwargs["precomputed_partition_artifact_uri"] ): @@ -3279,17 +3243,7 @@ def create_index( The number of sub-vectors for PQ (Product Quantization). accelerator : str or ``torch.Device``, optional If set, use an accelerator to speed up the training process. - Accepted accelerator: - - - "cuda" or ``torch.device(...)`` for the existing torch-based path - on NVIDIA GPUs - - "mps" for Apple Silicon GPU - - "cuvs" for the explicit cuVS-based IVF_PQ training path on NVIDIA - GPUs - - The cuVS path also requires the cuVS Python bindings to be installed - separately. - + Accepted accelerator: "cuda" (Nvidia GPU) and "mps" (Apple Silicon GPU). If not set, use the CPU. index_cache_size : int, optional The size of the index cache in number of entries. Default value is 256. @@ -3355,6 +3309,11 @@ def create_index( Only 4, 8 are supported. - index_file_version The version of the index file. Default is "V3". 
+ - precomputed_partition_artifact_uri + An advanced input produced by an external backend such as + `lance-cuvs`. When set, Lance skips its own partition assignment + and consumes the precomputed partition-local artifact during + finalization. Requires `ivf_centroids` and `pq_codebook`. Optional parameters for `IVF_RQ`: @@ -3398,13 +3357,8 @@ def create_index( Experimental Accelerator (GPU) support: - *accelerate*: use GPU to train IVF partitions. - `accelerator="cuda"` and `accelerator="mps"` use the existing torch - path. `accelerator="cuda"` runs on NVIDIA GPUs and `accelerator="mps"` - runs on Apple Silicon GPUs. `accelerator="cuvs"` uses cuVS for IVF_PQ - training only and requires an NVIDIA GPU. - - The torch path requires PyTorch. The cuVS path requires the cuVS - Python bindings to be installed separately. + Only supports CUDA (Nvidia) or MPS (Apple) currently. + Requires PyTorch being installed. .. code-block:: python @@ -3419,10 +3373,9 @@ def create_index( accelerator="cuda" ) - Note: accelerator support is currently limited to the ``IVF_PQ`` index type. - Providing ``accelerator="cuda"`` for other index types will fall back to CPU - index building. Providing ``accelerator="cuvs"`` for other index types will - raise an error. + Note: GPU acceleration is currently supported only for the ``IVF_PQ`` index + type. Providing an accelerator for other index types will fall back to CPU + index building. 
References ---------- diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py index 00591ead934..a13e92faf8f 100644 --- a/python/python/lance/indices/builder.py +++ b/python/python/lance/indices/builder.py @@ -8,8 +8,6 @@ import numpy as np import pyarrow as pa - -from lance.cuvs import is_cuvs_accelerator, prepare_global_ivf_pq_on_cuvs from lance.indices.ivf import IvfModel from lance.indices.pq import PqModel @@ -116,10 +114,11 @@ def train_ivf( self._verify_ivf_sample_rate(sample_rate, num_partitions, num_rows) distance_type = self._normalize_distance_type(distance_type) self._verify_ivf_params(num_partitions) - if is_cuvs_accelerator(accelerator): - raise NotImplementedError( - "IndicesBuilder.train_ivf does not support accelerator='cuvs'; " - "use prepare_global_ivf_pq instead" + if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): + raise ValueError( + "accelerator='cuvs' is not built into Lance. " + "Use the external 'lance-cuvs' package to build training outputs " + "and partition artifacts." ) if accelerator is None: @@ -256,23 +255,11 @@ def prepare_global_ivf_pq( `IndicesBuilder.train_pq` (indices.train_pq_model). No public method names elsewhere are changed. """ - if is_cuvs_accelerator(accelerator): - if fragment_ids is not None: - raise NotImplementedError( - "fragment_ids is not supported with accelerator='cuvs'" - ) - num_rows = self._count_rows() - num_partitions = self._determine_num_partitions(num_partitions, num_rows) - num_subvectors = self._normalize_pq_params(num_subvectors, self.dimension) - return prepare_global_ivf_pq_on_cuvs( - self.dataset, - self.column[0], - num_partitions, - num_subvectors, - distance_type=distance_type, - accelerator=accelerator, - sample_rate=sample_rate, - max_iters=max_iters, + if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): + raise ValueError( + "accelerator='cuvs' is not built into Lance. 
" + "Use the external 'lance-cuvs' package to build training outputs " + "and partition artifacts." ) # Global IVF training diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index d377f381246..f0be29f39ca 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -135,20 +135,6 @@ class LanceFileSession: def upload_file(self, local_path: str, remote_path: str) -> None: ... def download_file(self, remote_path: str, local_path: str) -> None: ... -class PartitionArtifactBuilder: - def __init__( - self, - uri_or_path: str, - num_partitions: int, - pq_code_width: int, - storage_options: Optional[Dict[str, str]] = None, - storage_options_provider: Optional[StorageOptionsProvider] = None, - ): ... - def append_batch(self, batch: pa.RecordBatch) -> None: ... - def finish( - self, metadata_file: str, total_loss: Optional[float] = None - ) -> List[str]: ... - class LanceFileReader: def __init__( self, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 9aba519fa9c..9606c91a724 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright The Lance Authors import logging -import json import os import platform import random @@ -10,29 +9,21 @@ import string import tempfile import time -from importlib import import_module -from pathlib import Path from typing import Optional import lance -import lance.cuvs as lance_cuvs import numpy as np import pyarrow as pa import pyarrow.compute as pc import pytest from lance import LanceDataset, LanceFragment from lance.dataset import VectorIndexReader -from lance.file import LanceFileReader from lance.indices import IndexFileVersion, IndicesBuilder from lance.query import MatchQuery, PhraseQuery from lance.util import validate_vector_index # noqa: E402 from lance.vector import vec_to_table # noqa: E402 -def 
_disable_rust_cuvs_backend(monkeypatch): - del monkeypatch - - def create_table(nvec=1000, ndim=128, nans=0, nullify=False, dtype=np.float32): mat = np.random.randn(nvec, ndim) if nans > 0: @@ -543,357 +534,46 @@ def test_create_index_accelerator_fallback(tmp_path, caplog): ) -def test_create_index_cuvs_dispatch(tmp_path, monkeypatch): - tbl = create_table(nvec=512, ndim=128) - dataset = lance.write_dataset(tbl, tmp_path) - calls = {} - - def fake_build( - dataset_arg, - column, - metric_type, - accelerator, - num_partitions, - num_sub_vectors, - **kwargs, - ): - calls["dataset"] = dataset_arg - calls["column"] = column - calls["num_partitions"] = num_partitions - calls["metric_type"] = metric_type - calls["accelerator"] = accelerator - calls["num_sub_vectors"] = num_sub_vectors - calls["kwargs"] = kwargs - return str(tmp_path / "cuvs_artifact"), [ - "manifest.json", - "metadata.lance", - "partitions/bucket-00000.lance", - ], np.random.randn(num_partitions, 128).astype(np.float32), np.random.randn( - num_sub_vectors, 256, 128 // num_sub_vectors - ).astype(np.float32) - - monkeypatch.setattr(lance_cuvs, "build_vector_index_on_cuvs", fake_build) - - dataset = dataset.create_index( - "vector", - index_type="IVF_PQ", - num_partitions=4, - num_sub_vectors=16, - accelerator="cuvs", - ) - - assert calls["column"] == "vector" - assert calls["num_partitions"] == 4 - assert calls["metric_type"] == "L2" - assert calls["accelerator"] == "cuvs" - assert calls["num_sub_vectors"] == 16 - assert calls["kwargs"]["sample_rate"] == 256 - assert calls["kwargs"]["max_iters"] == 50 - assert calls["kwargs"]["num_bits"] == 8 - assert calls["kwargs"]["batch_size"] == 1024 * 128 - assert calls["kwargs"]["filter_nan"] is True - assert dataset.stats.index_stats("vector_idx")["index_type"] == "IVF_PQ" - - -def test_create_index_cuvs_rejects_non_ivf_pq(tmp_path): +def test_create_index_rejects_cuvs_accelerator(tmp_path): tbl = create_table() dataset = lance.write_dataset(tbl, tmp_path) - with 
pytest.raises(ValueError, match="only supports IVF_PQ"): + with pytest.raises(ValueError, match="not built into Lance"): dataset.create_index( "vector", - index_type="IVF_FLAT", + index_type="IVF_PQ", num_partitions=4, + num_sub_vectors=16, accelerator="cuvs", ) -def test_prepare_global_ivf_pq_cuvs_dispatch(tmp_path, monkeypatch): - ds = _make_sample_dataset_base(tmp_path, "cuvs_prepare_ds", 512, 128) +def test_prepare_global_ivf_pq_rejects_cuvs_accelerator(tmp_path): + ds = _make_sample_dataset_base(tmp_path, "prepare_ivf_pq_cuvs_ds", 512, 128) builder = IndicesBuilder(ds, "vector") - builder_module = import_module("lance.indices.builder") - calls = {} - - def fake_prepare( - dataset_arg, - column, - num_partitions, - num_sub_vectors, - *, - distance_type, - accelerator, - sample_rate, - max_iters, - ): - calls["dataset"] = dataset_arg - calls["column"] = column - calls["num_partitions"] = num_partitions - calls["num_sub_vectors"] = num_sub_vectors - calls["distance_type"] = distance_type - calls["accelerator"] = accelerator - calls["sample_rate"] = sample_rate - calls["max_iters"] = max_iters - return { - "ivf_centroids": np.random.randn(num_partitions, 128).astype(np.float32), - "pq_codebook": np.random.randn( - num_sub_vectors, 256, 128 // num_sub_vectors - ).astype(np.float32), - } - - monkeypatch.setattr(builder_module, "prepare_global_ivf_pq_on_cuvs", fake_prepare) - - prepared = builder.prepare_global_ivf_pq( - num_partitions=4, - num_subvectors=16, - distance_type="l2", - accelerator="cuvs", - sample_rate=7, - max_iters=20, - ) - - assert calls["column"] == "vector" - assert calls["num_partitions"] == 4 - assert calls["num_sub_vectors"] == 16 - assert calls["distance_type"] == "l2" - assert calls["accelerator"] == "cuvs" - assert prepared["ivf_centroids"].shape == (4, 128) - assert prepared["pq_codebook"].shape == (16, 256, 8) - - -def test_train_ivf_pq_on_cuvs_nullable_vectors(tmp_path, monkeypatch): - _disable_rust_cuvs_backend(monkeypatch) - tbl = 
create_table(nvec=32, ndim=16, nullify=True) - dataset = lance.write_dataset(tbl, tmp_path) - - class FakeIndex: - centers = np.random.randn(4, 16).astype(np.float32) - pq_centers = np.random.randn(4, 256, 4).astype(np.float32) - - class FakeIvfPqModule: - class IndexParams: - def __init__(self, **kwargs): - self.kwargs = kwargs - - @staticmethod - def build(build_params, matrix): - assert build_params.kwargs["n_lists"] == 4 - assert matrix.shape[1] == 16 - assert matrix.dtype == np.float32 - return FakeIndex() - - monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) - - centroids, pq_codebook = lance_cuvs.train_ivf_pq_on_cuvs( - dataset, - "vector", - 4, - "L2", - "cuvs", - 4, - sample_rate=4, - ) - - assert centroids.shape == (4, 16) - assert pq_codebook.shape == (4, 256, 4) - - -def test_train_ivf_pq_on_cuvs_uses_num_sub_vectors_for_pq_dim( - tmp_path, monkeypatch -): - _disable_rust_cuvs_backend(monkeypatch) - dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) - calls = {} - - class FakeIndex: - centers = np.random.randn(4, 16).astype(np.float32) - pq_centers = np.random.randn(2, 256, 8).astype(np.float32) - - class FakeIvfPqModule: - class IndexParams: - def __init__(self, **kwargs): - calls.update(kwargs) - - @staticmethod - def build(build_params, matrix): - assert matrix.shape[1] == 16 - return FakeIndex() - - monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) - - centroids, pq_codebook = lance_cuvs.train_ivf_pq_on_cuvs( - dataset, - "vector", - 4, - "l2", - "cuvs", - 2, - sample_rate=4, - ) - - assert calls["pq_dim"] == 2 - assert centroids.shape == (4, 16) - assert pq_codebook.shape == (2, 256, 8) - - -def test_normalize_pq_codebook_accepts_subvector_dim_first_layout(): - class FakeIndex: - pq_centers = np.random.randn(8, 16, 256).astype(np.float32) - - pq_codebook = lance_cuvs._normalize_pq_codebook( - FakeIndex(), num_sub_vectors=16, num_bits=8, dimension=128 - ) - - assert 
pq_codebook.shape == (16, 256, 8) - - -def test_cuvs_as_numpy_prefers_copy_to_host(): - class FakeDeviceTensor: - def copy_to_host(self): - return np.arange(6, dtype=np.float32).reshape(2, 3) - - array = lance_cuvs._as_numpy(FakeDeviceTensor()) - - assert isinstance(array, np.ndarray) - assert array.shape == (2, 3) - assert array.dtype == np.float32 - - -def test_annotate_precomputed_encoded_dataset_scans_fragment_directly(tmp_path): - dataset_uri = tmp_path / "encoded_dataset" - - def make_table(partition_ids: list[int], row_id_start: int): - part_ids = np.asarray(partition_ids, dtype=np.uint32) - row_ids = pa.array( - np.arange(row_id_start, row_id_start + len(partition_ids), dtype=np.uint64) - ) - pq_values = pa.array(np.zeros(len(partition_ids) * 4, dtype=np.uint8)) - pq_codes = pa.FixedSizeListArray.from_arrays(pq_values, 4) - return pa.Table.from_arrays( - [row_ids, pa.array(part_ids), pq_codes], - names=["row_id", "__ivf_part_id", "__pq_code"], + with pytest.raises(ValueError, match="not built into Lance"): + builder.prepare_global_ivf_pq( + num_partitions=4, + num_subvectors=16, + distance_type="l2", + accelerator="cuvs", + sample_rate=7, + max_iters=20, ) - ds = lance.write_dataset(make_table([0, 1, 1, 0], 0), dataset_uri) - ds = lance.write_dataset(make_table([2, 3, 2, 3], 4), dataset_uri, mode="append") - - lance_cuvs._annotate_precomputed_encoded_dataset(ds, [2, 2, 2, 2]) - - metadata = ds.metadata() - partition_fragments = json.loads( - metadata[ - lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY - ] - ) - assert partition_fragments == [[0], [0], [1], [1]] - - -def test_one_pass_assign_ivf_pq_on_cuvs_writes_partition_artifact(tmp_path, monkeypatch): - _disable_rust_cuvs_backend(monkeypatch) - tbl = create_table(nvec=32, ndim=16) - dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") - - ivf_centroids = np.random.randn(4, 16).astype(np.float32) - pq_codebook = np.random.randn(4, 256, 4).astype(np.float32) - - class 
FakeDeviceTensor: - def __init__(self, array): - self._array = array - def copy_to_host(self): - return self._array +def test_create_index_rejects_missing_precomputed_partition_artifact(tmp_path): + dataset = lance.write_dataset(create_table(nvec=64, ndim=128), tmp_path / "artifact_src") - class FakeCupyArray: - def __init__(self, array): - self.array = array - - class FakeCupyModule: - @staticmethod - def asarray(array): - return FakeCupyArray(array) - - class FakeIndex: - pq_dim = 4 - pq_bits = 8 - - class FakeIvfPqModule: - @staticmethod - def transform(index, vectors): - assert isinstance(index, FakeIndex) - assert isinstance(vectors, FakeCupyArray) - labels = np.arange(len(vectors.array), dtype=np.uint32) % 4 - pq_codes = np.full((len(vectors.array), 4), 7, dtype=np.uint8) - return FakeDeviceTensor(labels), FakeDeviceTensor(pq_codes) - - monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) - monkeypatch.setattr(lance_cuvs, "_optional_cupy", lambda: FakeCupyModule()) - - artifact_root, artifact_files = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( - dataset, - "vector", - "l2", - "cuvs", - ivf_centroids, - pq_codebook, - trained_index=FakeIndex(), - batch_size=8, - ) - - manifest_path = Path(artifact_root) / lance_cuvs.PARTITION_ARTIFACT_MANIFEST_FILE_NAME - metadata_path = Path(artifact_root) / lance_cuvs.PARTITION_ARTIFACT_METADATA_FILE_NAME - - assert manifest_path.exists() - assert metadata_path.exists() - assert any(path.endswith(".lance") for path in artifact_files) - - manifest = json.loads(manifest_path.read_text()) - assert manifest["version"] == lance_cuvs.PARTITION_ARTIFACT_MANIFEST_VERSION - assert manifest["num_partitions"] == 4 - assert manifest["metadata_file"] == lance_cuvs.PARTITION_ARTIFACT_METADATA_FILE_NAME - assert [entry["num_rows"] for entry in manifest["partitions"]] == [8, 8, 8, 8] - assert all(entry["path"] for entry in manifest["partitions"]) - assert all(entry["ranges"] for entry in manifest["partitions"]) - - 
metadata_reader = LanceFileReader(str(metadata_path)) - metadata_table = metadata_reader.read_all().to_table() - assert metadata_table.column("_ivf_centroids").type == pa.list_(pa.list_(pa.float32(), 16)) - assert metadata_table.column("_pq_codebook").type == pa.list_(pa.list_(pa.float32(), 4)) - - bucket_path = Path(artifact_root) / manifest["partitions"][0]["path"] - bucket_reader = LanceFileReader(str(bucket_path)) - bucket_table = bucket_reader.read_all().to_table() - assert bucket_table.column("_rowid").type == pa.uint64() - assert bucket_table.column("__pq_code").type == pa.list_(pa.uint8(), 4) - - -def test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( - tmp_path, - monkeypatch, -): - _disable_rust_cuvs_backend(monkeypatch) - tbl = create_table(nvec=32, ndim=128) - dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_incompatible") - - ivf_centroids = np.random.randn(4, 128).astype(np.float32) - pq_codebook = np.random.randn(16, 256, 8).astype(np.float32) - monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: object()) - - class FakeIndex: - pq_dim = 8 - pq_bits = 8 - - with pytest.raises( - ValueError, - match="cuVS transform output is incompatible with Lance IVF_PQ", - ): - lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( - dataset, + with pytest.raises(Exception): + dataset.create_index( "vector", - "l2", - "cuvs", - ivf_centroids, - pq_codebook, - trained_index=FakeIndex(), - batch_size=8, + index_type="IVF_PQ", + num_partitions=4, + num_sub_vectors=16, + ivf_centroids=np.random.randn(4, 128).astype(np.float32), + pq_codebook=np.random.randn(16, 256, 8).astype(np.float32), + precomputed_partition_artifact_uri=str(tmp_path / "missing_artifact"), ) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 19c3e4ec5d4..0a0342019db 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -3371,10 +3371,6 @@ fn prepare_vector_index_params( ivf_params.precomputed_partitions_file = Some(f.to_string()); }; - if let 
Some(uri) = kwargs.get_item("precomputed_encoded_dataset_uri")? { - ivf_params.precomputed_encoded_dataset_uri = Some(uri.to_string()); - }; - if let Some(uri) = kwargs.get_item("precomputed_partition_artifact_uri")? { ivf_params.precomputed_partition_artifact_uri = Some(uri.to_string()); }; diff --git a/python/src/file.rs b/python/src/file.rs index eb830dc4a73..da8ba3e76bb 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -18,7 +18,6 @@ use arrow_array::{RecordBatch, RecordBatchReader, UInt32Array}; use arrow_schema::Schema as ArrowSchema; use bytes::Bytes; use futures::stream::StreamExt; -use lance::index::vector::PartitionArtifactBuilder as CorePartitionArtifactBuilder; use lance::io::{ObjectStore, RecordBatchStream}; use lance_core::cache::LanceCache; use lance_core::utils::path::LancePathExt; @@ -371,82 +370,6 @@ impl Drop for LanceFileWriter { } } -#[pyclass] -pub struct PartitionArtifactBuilder { - inner: Arc>, -} - -impl PartitionArtifactBuilder { - #[allow(clippy::too_many_arguments)] - async fn open( - uri_or_path: String, - num_partitions: usize, - pq_code_width: usize, - storage_options: Option>, - storage_options_provider: Option>, - ) -> PyResult { - let (object_store, path) = object_store_from_uri_or_path_with_provider( - uri_or_path, - storage_options, - storage_options_provider, - ) - .await?; - let inner = CorePartitionArtifactBuilder::try_new_with_store( - object_store, - path, - num_partitions, - pq_code_width, - ) - .infer_error()?; - Ok(Self { - inner: Arc::new(Mutex::new(inner)), - }) - } -} - -#[pymethods] -impl PartitionArtifactBuilder { - #[new] - #[pyo3(signature=(uri_or_path, num_partitions, pq_code_width, storage_options=None, storage_options_provider=None))] - #[allow(clippy::too_many_arguments)] - pub fn new( - uri_or_path: String, - num_partitions: usize, - pq_code_width: usize, - storage_options: Option>, - storage_options_provider: Option<&Bound<'_, PyAny>>, - ) -> PyResult { - let provider = storage_options_provider - 
.map(crate::storage_options::py_object_to_storage_options_provider) - .transpose()?; - rt().block_on( - None, - Self::open( - uri_or_path, - num_partitions, - pq_code_width, - storage_options, - provider, - ), - )? - } - - pub fn append_batch(&self, batch: PyArrowType) -> PyResult<()> { - rt().runtime.block_on(async { - self.inner.lock().await.append_batch(&batch.0).await - }) - .infer_error() - } - - #[pyo3(signature=(metadata_file, total_loss=None))] - pub fn finish(&self, metadata_file: String, total_loss: Option) -> PyResult> { - rt().runtime.block_on(async { - self.inner.lock().await.finish(&metadata_file, total_loss).await - }) - .infer_error() - } -} - pub async fn object_store_from_uri_or_path_no_options( uri_or_path: impl AsRef, ) -> PyResult<(Arc, Path)> { diff --git a/python/src/lib.rs b/python/src/lib.rs index 819e3fddc3e..9730f2ba1c5 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -51,8 +51,7 @@ use dataset::{ use env_logger::{Builder, Env}; use file::{ LanceBufferDescriptor, LanceColumnMetadata, LanceFileMetadata, LanceFileReader, - LanceFileStatistics, LanceFileWriter, LancePageMetadata, PartitionArtifactBuilder, - stable_version, + LanceFileStatistics, LanceFileWriter, LancePageMetadata, stable_version, }; use log::Level; use pyo3::exceptions::PyIOError; @@ -259,7 +258,6 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/rust/lance-index/src/vector/ivf/builder.rs b/rust/lance-index/src/vector/ivf/builder.rs index 155b33f58b9..9dfcd675be8 100644 --- a/rust/lance-index/src/vector/ivf/builder.rs +++ b/rust/lance-index/src/vector/ivf/builder.rs @@ -48,11 +48,6 @@ pub struct IvfBuildParams { /// The input is expected to be (/dir/to/buffers, [buffer1.lance, buffer2.lance, ...]) pub precomputed_shuffle_buffers: Option<(String, Vec)>, - /// Precomputed encoded dataset (_rowid/row_id -> 
partition_id, pq_code). - /// Mutually exclusive with `precomputed_partitions_file` and `precomputed_shuffle_buffers`. - /// Requires `centroids` to be set. - pub precomputed_encoded_dataset_uri: Option, - /// Precomputed partitioned artifact produced by an external backend. /// Mutually exclusive with other precomputed inputs and requires `centroids` to be set. pub precomputed_partition_artifact_uri: Option, @@ -76,7 +71,6 @@ impl Default for IvfBuildParams { sample_rate: 256, // See faiss precomputed_partitions_file: None, precomputed_shuffle_buffers: None, - precomputed_encoded_dataset_uri: None, precomputed_partition_artifact_uri: None, shuffle_partition_batches: 1024 * 10, shuffle_partition_concurrency: 2, diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index 48235a65582..684f12b2d96 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -8,7 +8,6 @@ use std::sync::Arc; use std::{any::Any, collections::HashMap}; pub mod builder; -mod encoded_dataset; pub mod ivf; mod partition_artifact; pub mod pq; @@ -1658,7 +1657,6 @@ fn derive_ivf_params(ivf_model: &IvfModel) -> IvfBuildParams { sample_rate: 256, // Default precomputed_partitions_file: None, precomputed_shuffle_buffers: None, - precomputed_encoded_dataset_uri: None, precomputed_partition_artifact_uri: None, shuffle_partition_batches: 1024 * 10, // Default shuffle_partition_concurrency: 2, // Default diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 9c9d4b16eed..b753caadc67 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -92,7 +92,6 @@ use crate::index::vector::utils::{infer_vector_dim, infer_vector_element_type}; use super::v2::IVFIndex; use super::{ - encoded_dataset::EncodedDatasetShuffleReader, ivf::load_precomputed_partitions_if_available, partition_artifact::PartitionArtifactShuffleReader, utils::{self, get_vector_type}, @@ -228,19 +227,6 @@ impl 
IvfIndexBuilder )) } - async fn try_open_precomputed_encoded_dataset_reader( - &self, - uri: &str, - ) -> Result> { - let storage_options = self - .ivf_params - .as_ref() - .and_then(|params| params.storage_options.as_ref()); - Ok(Arc::new( - EncodedDatasetShuffleReader::try_open(uri, storage_options).await?, - )) - } - async fn try_open_precomputed_partition_artifact_reader( &self, uri: &str, @@ -649,19 +635,6 @@ impl IvfIndexBuilder return Ok(()); } - if let Some(uri) = self - .ivf_params - .as_ref() - .and_then(|params| params.precomputed_encoded_dataset_uri.as_deref()) - { - log::info!("shuffle with precomputed encoded dataset from {}", uri); - self.shuffle_reader = Some( - self.try_open_precomputed_encoded_dataset_reader(uri) - .await?, - ); - return Ok(()); - } - let stream = match self .ivf_params .as_ref() diff --git a/rust/lance/src/index/vector/encoded_dataset.rs b/rust/lance/src/index/vector/encoded_dataset.rs deleted file mode 100644 index 866f903805c..00000000000 --- a/rust/lance/src/index/vector/encoded_dataset.rs +++ /dev/null @@ -1,370 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -use std::collections::HashMap; -use std::sync::Arc; - -use arrow_schema::Fields; -use futures::StreamExt; -use lance_core::utils::tokio::get_num_compute_intensive_cpus; -use lance_core::{Error, ROW_ID, Result}; -use lance_index::vector::v3::shuffler::ShuffleReader; -use lance_index::vector::{PART_ID_COLUMN, PQ_CODE_COLUMN}; -use lance_io::stream::{RecordBatchStream, RecordBatchStreamAdapter}; -use lance_table::format::Fragment; -use log::warn; -use serde::de::DeserializeOwned; - -use crate::Dataset; -use crate::dataset::builder::DatasetBuilder; - -pub(crate) const PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY: &str = - "lance:index_build:precomputed_encoded_partition_sizes"; -pub(crate) const PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY: &str = - 
"lance:index_build:precomputed_encoded_partition_fragment_ids"; -pub(crate) const PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY: &str = - "lance:index_build:precomputed_encoded_total_loss"; - -const PRECOMPUTED_ROW_ID_COLUMN: &str = "row_id"; - -pub(crate) struct EncodedDatasetShuffleReader { - dataset: Dataset, - row_id_column: String, - partition_sizes: Vec, - partition_fragments: Option>>, - total_loss: Option, -} - -impl EncodedDatasetShuffleReader { - pub(crate) async fn try_open( - uri: &str, - storage_options: Option<&HashMap>, - ) -> Result { - let mut builder = DatasetBuilder::from_uri(uri); - if let Some(storage_options) = storage_options { - builder = builder.with_storage_options(storage_options.clone()); - } - let dataset = builder.load().await?; - Self::try_new(dataset) - } - - pub(crate) fn try_new(dataset: Dataset) -> Result { - let row_id_column = if dataset.schema().field(ROW_ID).is_some() { - ROW_ID.to_string() - } else if dataset.schema().field(PRECOMPUTED_ROW_ID_COLUMN).is_some() { - PRECOMPUTED_ROW_ID_COLUMN.to_string() - } else { - return Err(Error::invalid_input(format!( - "precomputed encoded dataset must contain '{}' or '{}' column", - ROW_ID, PRECOMPUTED_ROW_ID_COLUMN - ))); - }; - - for required_column in [PART_ID_COLUMN, PQ_CODE_COLUMN] { - if dataset.schema().field(required_column).is_none() { - return Err(Error::invalid_input(format!( - "precomputed encoded dataset is missing required column '{}'", - required_column - ))); - } - } - - let metadata = dataset.metadata(); - let partition_sizes: Vec = - parse_required_metadata(metadata, PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY)?; - - let partition_fragments = parse_optional_metadata::>>( - metadata, - PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY, - )? 
- .map(|partition_fragment_ids| resolve_partition_fragments(&dataset, partition_fragment_ids)) - .transpose()?; - - if let Some(partition_fragments) = partition_fragments.as_ref() { - if partition_fragments.len() != partition_sizes.len() { - return Err(Error::invalid_input(format!( - "metadata '{}' has {} partitions but '{}' has {}", - PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY, - partition_fragments.len(), - PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY, - partition_sizes.len(), - ))); - } - } - - let total_loss = - parse_optional_metadata::(metadata, PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY)?; - - Ok(Self { - dataset, - row_id_column, - partition_sizes, - partition_fragments, - total_loss, - }) - } - - fn rename_row_id( - stream: impl RecordBatchStream + Unpin + 'static, - row_id_idx: usize, - ) -> impl RecordBatchStream + Unpin + 'static { - let new_schema = Arc::new(arrow_schema::Schema::new( - stream - .schema() - .fields - .iter() - .enumerate() - .map(|(field_idx, field)| { - if field_idx == row_id_idx { - arrow_schema::Field::new( - ROW_ID, - field.data_type().clone(), - field.is_nullable(), - ) - } else { - field.as_ref().clone() - } - }) - .collect::(), - )); - RecordBatchStreamAdapter::new( - new_schema.clone(), - stream.map(move |batch| match batch { - Ok(batch) => { - arrow_array::RecordBatch::try_new(new_schema.clone(), batch.columns().to_vec()) - .map_err(Error::from) - } - Err(error) => Err(error), - }), - ) - } -} - -#[async_trait::async_trait] -impl ShuffleReader for EncodedDatasetShuffleReader { - async fn read_partition( - &self, - partition_id: usize, - ) -> Result>> { - if partition_id >= self.partition_sizes.len() { - return Ok(None); - } - if self.partition_sizes[partition_id] == 0 { - return Ok(None); - } - - let mut scanner = self.dataset.scan(); - scanner.batch_readahead(get_num_compute_intensive_cpus()); - scanner.project(&[self.row_id_column.as_str(), PART_ID_COLUMN, PQ_CODE_COLUMN])?; - - if let 
Some(partition_fragments) = self.partition_fragments.as_ref() { - let fragments = &partition_fragments[partition_id]; - if fragments.is_empty() { - warn!( - "precomputed encoded dataset metadata has no fragments for non-empty partition {}, falling back to filtered scan", - partition_id - ); - } else { - scanner.with_fragments(fragments.clone()); - } - } - - scanner.filter(&format!("{PART_ID_COLUMN} = {partition_id}"))?; - let stream = scanner.try_into_stream().await?; - if let Some((row_id_idx, _)) = stream.schema().column_with_name(PRECOMPUTED_ROW_ID_COLUMN) { - Ok(Some(Box::new(Self::rename_row_id(stream, row_id_idx)))) - } else { - Ok(Some(Box::new(stream))) - } - } - - fn partition_size(&self, partition_id: usize) -> Result { - Ok(self.partition_sizes.get(partition_id).copied().unwrap_or(0)) - } - - fn total_loss(&self) -> Option { - self.total_loss - } -} - -fn parse_required_metadata( - metadata: &HashMap, - key: &str, -) -> Result { - let value = metadata.get(key).ok_or_else(|| { - Error::invalid_input(format!( - "precomputed encoded dataset is missing required metadata '{}'", - key - )) - })?; - parse_metadata_value(value, key) -} - -fn parse_optional_metadata( - metadata: &HashMap, - key: &str, -) -> Result> { - metadata - .get(key) - .map(|value| parse_metadata_value(value, key)) - .transpose() -} - -fn parse_metadata_value(value: &str, key: &str) -> Result { - serde_json::from_str(value).map_err(|error| { - Error::invalid_input(format!( - "failed to parse precomputed encoded dataset metadata '{}' from '{}': {}", - key, value, error - )) - }) -} - -fn resolve_partition_fragments( - dataset: &Dataset, - partition_fragment_ids: Vec>, -) -> Result>> { - let fragments_by_id = dataset - .fragments() - .iter() - .cloned() - .map(|fragment| (fragment.id, fragment)) - .collect::>(); - - partition_fragment_ids - .into_iter() - .map(|fragment_ids| { - fragment_ids - .into_iter() - .map(|fragment_id| { - fragments_by_id.get(&fragment_id).cloned().ok_or_else(|| { - 
Error::invalid_input(format!( - "precomputed encoded dataset metadata references unknown fragment id {}", - fragment_id - )) - }) - }) - .collect() - }) - .collect() -} - -#[cfg(test)] -mod tests { - use super::*; - - use arrow_array::{ - FixedSizeListArray, RecordBatch, RecordBatchIterator, UInt8Array, UInt32Array, UInt64Array, - cast::AsArray, - }; - use futures::TryStreamExt; - use lance_arrow::FixedSizeListArrayExt; - - use crate::dataset::WriteParams; - - #[tokio::test] - async fn encoded_dataset_reader_reads_mapped_fragments_and_renames_row_id() { - let schema = Arc::new(arrow_schema::Schema::new(vec![ - arrow_schema::Field::new("row_id", arrow_schema::DataType::UInt64, false), - arrow_schema::Field::new(PART_ID_COLUMN, arrow_schema::DataType::UInt32, false), - arrow_schema::Field::new( - PQ_CODE_COLUMN, - arrow_schema::DataType::FixedSizeList( - Arc::new(arrow_schema::Field::new( - "item", - arrow_schema::DataType::UInt8, - true, - )), - 2, - ), - true, - ), - ])); - - let batch1 = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(UInt64Array::from(vec![10_u64, 11])), - Arc::new(UInt32Array::from(vec![0_u32, 1])), - Arc::new( - FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![1, 2, 3, 4]), 2) - .unwrap(), - ), - ], - ) - .unwrap(); - let batch2 = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(UInt64Array::from(vec![12_u64, 13])), - Arc::new(UInt32Array::from(vec![1_u32, 1])), - Arc::new( - FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![5, 6, 7, 8]), 2) - .unwrap(), - ), - ], - ) - .unwrap(); - - let reader = RecordBatchIterator::new(vec![Ok(batch1), Ok(batch2)], schema); - let write_params = WriteParams { - max_rows_per_file: 2, - max_rows_per_group: 2, - ..Default::default() - }; - let mut dataset = Dataset::write( - reader, - "memory://precomputed-encoded-reader", - Some(write_params), - ) - .await - .unwrap(); - - let fragment_ids = dataset - .get_fragments() - .into_iter() - .map(|fragment| 
fragment.metadata().id) - .collect::>(); - assert_eq!(fragment_ids.len(), 2); - - dataset - .update_metadata(vec![ - ( - PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY.to_string(), - serde_json::to_string(&vec![1_usize, 3]).unwrap(), - ), - ( - PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY.to_string(), - serde_json::to_string(&vec![ - vec![fragment_ids[0] as u64], - vec![fragment_ids[0] as u64, fragment_ids[1] as u64], - ]) - .unwrap(), - ), - ]) - .await - .unwrap(); - - let reader = EncodedDatasetShuffleReader::try_new(dataset).unwrap(); - assert_eq!(reader.partition_size(0).unwrap(), 1); - assert_eq!(reader.partition_size(1).unwrap(), 3); - - let stream = reader.read_partition(1).await.unwrap().unwrap(); - let batches = stream.try_collect::>().await.unwrap(); - let row_ids = batches - .iter() - .flat_map(|batch| { - batch[ROW_ID] - .as_primitive::() - .values() - .iter() - .copied() - }) - .collect::>(); - assert_eq!(row_ids, vec![11, 12, 13]); - assert!( - batches - .iter() - .all(|batch| batch.column_by_name("row_id").is_none()) - ); - } -} diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 4841c98d661..9f51459d55f 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -1204,12 +1204,6 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } - if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.centroids.is_none() { - return Err(Error::index( - "precomputed_encoded_dataset_uri requires centroids to be set".to_string(), - )); - } - if ivf.precomputed_partition_artifact_uri.is_some() && ivf.centroids.is_none() { return Err(Error::index( "precomputed_partition_artifact_uri requires centroids to be set".to_string(), @@ -1223,20 +1217,6 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } - if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.precomputed_partitions_file.is_some() { - return Err(Error::index( - 
"precomputed_encoded_dataset_uri and precomputed_partitions_file are mutually exclusive" - .to_string(), - )); - } - - if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.precomputed_shuffle_buffers.is_some() { - return Err(Error::index( - "precomputed_encoded_dataset_uri and precomputed_shuffle_buffers are mutually exclusive" - .to_string(), - )); - } - if ivf.precomputed_partition_artifact_uri.is_some() && ivf.precomputed_partitions_file.is_some() { return Err(Error::index( @@ -1253,15 +1233,6 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } - if ivf.precomputed_partition_artifact_uri.is_some() - && ivf.precomputed_encoded_dataset_uri.is_some() - { - return Err(Error::index( - "precomputed_partition_artifact_uri and precomputed_encoded_dataset_uri are mutually exclusive" - .to_string(), - )); - } - Ok(()) } @@ -1273,12 +1244,6 @@ fn sanity_check_params(ivf: &IvfBuildParams, pq: &PQBuildParams) -> Result<()> { )); } - if ivf.precomputed_encoded_dataset_uri.is_some() && pq.codebook.is_none() { - return Err(Error::index( - "precomputed_encoded_dataset_uri requires codebooks to be set".to_string(), - )); - } - if ivf.precomputed_partition_artifact_uri.is_some() && pq.codebook.is_none() { return Err(Error::index( "precomputed_partition_artifact_uri requires codebooks to be set".to_string(), From 5ea24989c238e3c0f8de8cf673aaf2e1f3a70213 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 8 Apr 2026 20:01:02 +0800 Subject: [PATCH 17/21] python: delegate cuvs acceleration to external backend --- python/python/lance/cuvs.py | 111 ++++++++++++++++ python/python/lance/dataset.py | 137 ++++++++++++-------- python/python/lance/indices/builder.py | 33 +++-- python/python/tests/test_vector_index.py | 155 +++++++++++++++++++++-- 4 files changed, 362 insertions(+), 74 deletions(-) create mode 100644 python/python/lance/cuvs.py diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py new file mode 100644 index 00000000000..ba7a1a67738 --- 
/dev/null +++ b/python/python/lance/cuvs.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +from __future__ import annotations + +import os +import tempfile +from importlib import import_module +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + + +def is_cuvs_accelerator(accelerator: object) -> bool: + return isinstance(accelerator, str) and accelerator.lower() == "cuvs" + + +def _require_lance_cuvs(): + try: + return import_module("lance_cuvs") + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "accelerator='cuvs' requires the external 'lance-cuvs' package " + "to be installed." + ) from exc + + +def build_vector_index_on_cuvs( + dataset, + column: str, + metric_type: str, + accelerator: str, + num_partitions: int, + num_sub_vectors: int, + dst_dataset_uri: str | Path | None = None, + *, + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + batch_size: int = 1024 * 128, + filter_nan: bool = True, +): + if not is_cuvs_accelerator(accelerator): + raise ValueError("build_vector_index_on_cuvs requires accelerator='cuvs'") + + backend = _require_lance_cuvs() + artifact_uri = ( + os.fspath(dst_dataset_uri) + if dst_dataset_uri is not None + else tempfile.mkdtemp(prefix="lance-cuvs-artifact-") + ) + training = backend.train_ivf_pq( + dataset.uri, + column, + metric_type=metric_type, + num_partitions=num_partitions, + num_sub_vectors=num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, + ) + artifact = backend.build_ivf_pq_artifact( + dataset.uri, + column, + training=training, + artifact_uri=artifact_uri, + batch_size=batch_size, + filter_nan=filter_nan, + ) + return ( + artifact.artifact_uri, + artifact.files, + training.ivf_centroids(), + training.pq_codebook(), + ) + + +def prepare_global_ivf_pq_on_cuvs( + dataset, + column: str, + num_partitions: int, + num_sub_vectors: int, 
+ *, + distance_type: str = "l2", + accelerator: str = "cuvs", + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + filter_nan: bool = True, +): + if not is_cuvs_accelerator(accelerator): + raise ValueError("prepare_global_ivf_pq_on_cuvs requires accelerator='cuvs'") + + backend = _require_lance_cuvs() + training = backend.train_ivf_pq( + dataset.uri, + column, + metric_type=distance_type, + num_partitions=num_partitions, + num_sub_vectors=num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, + ) + return { + "ivf_centroids": training.ivf_centroids(), + "pq_codebook": training.pq_codebook(), + } diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 32e9e548d68..7c3a7bcee06 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -39,6 +39,7 @@ from lance.log import LOGGER from .blob import BlobFile +from .cuvs import is_cuvs_accelerator from .dependencies import ( _check_for_numpy, _check_for_torch, @@ -2899,19 +2900,14 @@ def _create_index_impl( # Handle timing for various parts of accelerated builds timers = {} - if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): - raise ValueError( - "accelerator='cuvs' is not built into Lance. " - "Use the external 'lance-cuvs' package to produce a " - "precomputed partition artifact and then call create_index " - "with precomputed_partition_artifact_uri." - ) + use_cuvs = is_cuvs_accelerator(accelerator) if accelerator is not None and index_type != "IVF_PQ": LOGGER.warning( "Index type %s does not support GPU acceleration; falling back to CPU", index_type, ) accelerator = None + use_cuvs = False # IMPORTANT: Distributed indexing is CPU-only. Enforce single-node when # accelerator or torch-related paths are detected. 
@@ -2960,52 +2956,79 @@ def _create_index_impl( num_partitions = _target_partition_size_to_num_partitions( num_rows, target_partition_size ) - from .vector import ( - one_pass_assign_ivf_pq_on_accelerator, - one_pass_train_ivf_pq_on_accelerator, - ) + if use_cuvs: + from .cuvs import build_vector_index_on_cuvs + + LOGGER.info("Doing cuVS vector backend build") + timers["ivf+pq_build:start"] = time.time() + artifact_root, _, ivf_centroids, pq_codebook = build_vector_index_on_cuvs( + self, + column[0], + metric, + accelerator, + num_partitions, + num_sub_vectors, + sample_rate=kwargs.get("sample_rate", 256), + max_iters=kwargs.get("max_iters", 50), + num_bits=kwargs.get("num_bits", 8), + batch_size=1024 * 128, + filter_nan=filter_nan, + ) + kwargs["precomputed_partition_artifact_uri"] = artifact_root + timers["ivf+pq_build:end"] = time.time() + ivfpq_build_time = ( + timers["ivf+pq_build:end"] - timers["ivf+pq_build:start"] + ) + LOGGER.info("cuVS ivf+pq build time: %ss", ivfpq_build_time) + else: + from .vector import ( + one_pass_assign_ivf_pq_on_accelerator, + one_pass_train_ivf_pq_on_accelerator, + ) - LOGGER.info("Doing one-pass ivfpq accelerated computations") - timers["ivf+pq_train:start"] = time.time() - ( - ivf_centroids, - ivf_kmeans, - pq_codebook, - pq_kmeans_list, - ) = one_pass_train_ivf_pq_on_accelerator( - self, - column[0], - num_partitions, - metric, - accelerator, - num_sub_vectors=num_sub_vectors, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_train:end"] = time.time() - ivfpq_train_time = timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] - LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) - timers["ivf+pq_assign:start"] = time.time() - shuffle_output_dir, shuffle_buffers = one_pass_assign_ivf_pq_on_accelerator( - self, - column[0], - metric, - accelerator, - ivf_kmeans, - pq_kmeans_list, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_assign:end"] = time.time() - ivfpq_assign_time = ( - 
timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] - ) - LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) + LOGGER.info("Doing one-pass ivfpq accelerated computations") + timers["ivf+pq_train:start"] = time.time() + ( + ivf_centroids, + ivf_kmeans, + pq_codebook, + pq_kmeans_list, + ) = one_pass_train_ivf_pq_on_accelerator( + self, + column[0], + num_partitions, + metric, + accelerator, + num_sub_vectors=num_sub_vectors, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_train:end"] = time.time() + ivfpq_train_time = ( + timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] + ) + LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) + timers["ivf+pq_assign:start"] = time.time() + shuffle_output_dir, shuffle_buffers = one_pass_assign_ivf_pq_on_accelerator( + self, + column[0], + metric, + accelerator, + ivf_kmeans, + pq_kmeans_list, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_assign:end"] = time.time() + ivfpq_assign_time = ( + timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] + ) + LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) - kwargs["precomputed_shuffle_buffers"] = shuffle_buffers - kwargs["precomputed_shuffle_buffers_path"] = os.path.join( - shuffle_output_dir, "data" - ) + kwargs["precomputed_shuffle_buffers"] = shuffle_buffers + kwargs["precomputed_shuffle_buffers_path"] = os.path.join( + shuffle_output_dir, "data" + ) if index_type.startswith("IVF"): if (ivf_centroids is not None) and (ivf_centroids_file is not None): raise ValueError( @@ -3243,7 +3266,12 @@ def create_index( The number of sub-vectors for PQ (Product Quantization). accelerator : str or ``torch.Device``, optional If set, use an accelerator to speed up the training process. - Accepted accelerator: "cuda" (Nvidia GPU) and "mps" (Apple Silicon GPU). + Accepted accelerator: + + - "cuda" (Nvidia GPU) + - "mps" (Apple Silicon GPU) + - "cuvs" for the external `lance-cuvs` backend + If not set, use the CPU. 
index_cache_size : int, optional The size of the index cache in number of entries. Default value is 256. @@ -3357,8 +3385,9 @@ def create_index( Experimental Accelerator (GPU) support: - *accelerate*: use GPU to train IVF partitions. - Only supports CUDA (Nvidia) or MPS (Apple) currently. - Requires PyTorch being installed. + Supports CUDA (Nvidia) and MPS (Apple) via the built-in torch path. + `accelerator="cuvs"` delegates IVF_PQ build preparation to the + external `lance-cuvs` package. .. code-block:: python diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py index a13e92faf8f..00591ead934 100644 --- a/python/python/lance/indices/builder.py +++ b/python/python/lance/indices/builder.py @@ -8,6 +8,8 @@ import numpy as np import pyarrow as pa + +from lance.cuvs import is_cuvs_accelerator, prepare_global_ivf_pq_on_cuvs from lance.indices.ivf import IvfModel from lance.indices.pq import PqModel @@ -114,11 +116,10 @@ def train_ivf( self._verify_ivf_sample_rate(sample_rate, num_partitions, num_rows) distance_type = self._normalize_distance_type(distance_type) self._verify_ivf_params(num_partitions) - if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): - raise ValueError( - "accelerator='cuvs' is not built into Lance. " - "Use the external 'lance-cuvs' package to build training outputs " - "and partition artifacts." + if is_cuvs_accelerator(accelerator): + raise NotImplementedError( + "IndicesBuilder.train_ivf does not support accelerator='cuvs'; " + "use prepare_global_ivf_pq instead" ) if accelerator is None: @@ -255,11 +256,23 @@ def prepare_global_ivf_pq( `IndicesBuilder.train_pq` (indices.train_pq_model). No public method names elsewhere are changed. """ - if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): - raise ValueError( - "accelerator='cuvs' is not built into Lance. " - "Use the external 'lance-cuvs' package to build training outputs " - "and partition artifacts." 
+ if is_cuvs_accelerator(accelerator): + if fragment_ids is not None: + raise NotImplementedError( + "fragment_ids is not supported with accelerator='cuvs'" + ) + num_rows = self._count_rows() + num_partitions = self._determine_num_partitions(num_partitions, num_rows) + num_subvectors = self._normalize_pq_params(num_subvectors, self.dimension) + return prepare_global_ivf_pq_on_cuvs( + self.dataset, + self.column[0], + num_partitions, + num_subvectors, + distance_type=distance_type, + accelerator=accelerator, + sample_rate=sample_rate, + max_iters=max_iters, ) # Global IVF training diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 9606c91a724..e5f10028839 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -9,9 +9,11 @@ import string import tempfile import time +from pathlib import Path from typing import Optional import lance +import lance.cuvs as lance_cuvs import numpy as np import pyarrow as pa import pyarrow.compute as pc @@ -534,11 +536,20 @@ def test_create_index_accelerator_fallback(tmp_path, caplog): ) -def test_create_index_rejects_cuvs_accelerator(tmp_path): +def test_create_index_requires_external_cuvs_backend(tmp_path, monkeypatch): tbl = create_table() dataset = lance.write_dataset(tbl, tmp_path) + original_import_module = lance_cuvs.import_module - with pytest.raises(ValueError, match="not built into Lance"): + def _raise_missing(name): + if name == "lance_cuvs": + raise ModuleNotFoundError("No module named 'lance_cuvs'") + return original_import_module(name) + + monkeypatch.setattr(lance_cuvs, "import_module", _raise_missing) + with pytest.raises( + ModuleNotFoundError, match="requires the external 'lance-cuvs' package" + ): dataset.create_index( "vector", index_type="IVF_PQ", @@ -548,18 +559,142 @@ def test_create_index_rejects_cuvs_accelerator(tmp_path): ) -def test_prepare_global_ivf_pq_rejects_cuvs_accelerator(tmp_path): +class 
_FakeCuvsTraining: + def __init__(self, ivf_centroids, pq_codebook): + self._ivf_centroids = ivf_centroids + self._pq_codebook = pq_codebook + + def ivf_centroids(self): + return self._ivf_centroids + + def pq_codebook(self): + return self._pq_codebook + + +class _FakeCuvsArtifact: + def __init__(self, artifact_uri, files): + self.artifact_uri = artifact_uri + self.files = files + + +def _make_fake_cuvs_training(num_partitions: int = 4, dimension: int = 128): + centroids = pa.FixedSizeListArray.from_arrays( + pa.array(np.arange(num_partitions * dimension, dtype=np.float32)), + dimension, + ) + codebook = pa.FixedSizeListArray.from_arrays( + pa.array(np.arange(16 * 256 * 8, dtype=np.float32)), + 8, + ) + return _FakeCuvsTraining(centroids, codebook) + + +def test_build_vector_index_on_cuvs_delegates_to_external_backend(tmp_path, monkeypatch): ds = _make_sample_dataset_base(tmp_path, "prepare_ivf_pq_cuvs_ds", 512, 128) - builder = IndicesBuilder(ds, "vector") - with pytest.raises(ValueError, match="not built into Lance"): - builder.prepare_global_ivf_pq( - num_partitions=4, - num_subvectors=16, - distance_type="l2", - accelerator="cuvs", + calls = {} + training = _make_fake_cuvs_training() + + class _FakeBackend: + def train_ivf_pq(self, dataset_uri, column, **kwargs): + calls["train"] = { + "dataset_uri": dataset_uri, + "column": column, + **kwargs, + } + return training + + def build_ivf_pq_artifact(self, dataset_uri, column, **kwargs): + calls["build"] = { + "dataset_uri": dataset_uri, + "column": column, + **kwargs, + } + return _FakeCuvsArtifact( + artifact_uri=str(tmp_path / "artifact"), + files=[str(tmp_path / "artifact" / "data.lance")], + ) + + monkeypatch.setattr(lance_cuvs, "_require_lance_cuvs", lambda: _FakeBackend()) + + artifact_uri, files, ivf_centroids, pq_codebook = ( + lance_cuvs.build_vector_index_on_cuvs( + ds, + "vector", + "l2", + "cuvs", + 4, + 16, + dst_dataset_uri=tmp_path / "artifact_root", sample_rate=7, max_iters=20, + num_bits=4, + 
batch_size=4096, + filter_nan=False, ) + ) + + assert calls["train"] == { + "dataset_uri": ds.uri, + "column": "vector", + "metric_type": "l2", + "num_partitions": 4, + "num_sub_vectors": 16, + "sample_rate": 7, + "max_iters": 20, + "num_bits": 4, + "filter_nan": False, + } + assert calls["build"]["dataset_uri"] == ds.uri + assert calls["build"]["column"] == "vector" + assert calls["build"]["training"] is training + assert calls["build"]["artifact_uri"] == str(tmp_path / "artifact_root") + assert calls["build"]["batch_size"] == 4096 + assert calls["build"]["filter_nan"] is False + assert artifact_uri == str(tmp_path / "artifact") + assert files == [str(tmp_path / "artifact" / "data.lance")] + assert ivf_centroids.equals(training.ivf_centroids()) + assert pq_codebook.equals(training.pq_codebook()) + + +def test_prepare_global_ivf_pq_delegates_to_external_cuvs_backend(tmp_path, monkeypatch): + ds = _make_sample_dataset_base(tmp_path, "prepare_ivf_pq_cuvs_ds", 512, 128) + builder = IndicesBuilder(ds, "vector") + training = _make_fake_cuvs_training() + calls = {} + + class _FakeBackend: + def train_ivf_pq(self, dataset_uri, column, **kwargs): + calls["train"] = { + "dataset_uri": dataset_uri, + "column": column, + **kwargs, + } + return training + + monkeypatch.setattr(lance_cuvs, "_require_lance_cuvs", lambda: _FakeBackend()) + + prepared = builder.prepare_global_ivf_pq( + num_partitions=4, + num_subvectors=16, + distance_type="l2", + accelerator="cuvs", + sample_rate=7, + max_iters=20, + ) + + assert calls["train"] == { + "dataset_uri": ds.uri, + "column": "vector", + "metric_type": "l2", + "num_partitions": 4, + "num_sub_vectors": 16, + "sample_rate": 7, + "max_iters": 20, + "num_bits": 8, + "filter_nan": True, + } + assert prepared["ivf_centroids"].equals(training.ivf_centroids()) + assert prepared["pq_codebook"].equals(training.pq_codebook()) def test_create_index_rejects_missing_precomputed_partition_artifact(tmp_path): From 
d60e11f8f63d72c776f290e990bb135e35215bbc Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 8 Apr 2026 20:05:10 +0800 Subject: [PATCH 18/21] fix: remove merge leftover import --- rust/lance/src/index/vector/builder.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index fe801fc68aa..3e85a9d40b3 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -3,7 +3,6 @@ use std::cmp::Ordering; use std::collections::HashSet; -use std::future; use std::path::Path as StdPath; use std::sync::Arc; use std::{collections::HashMap, pin::Pin}; From f2548505e7b05488fe9cef4fba2aeeddfe3e8a6c Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 9 Apr 2026 00:29:18 +0800 Subject: [PATCH 19/21] refactor: drop transitional cuvs compatibility paths --- python/src/dataset.rs | 9 +- rust/lance-index/src/vector/ivf/builder.rs | 3 +- rust/lance-index/src/vector/ivf/shuffler.rs | 10 +- rust/lance-index/src/vector/v3/shuffler.rs | 202 +------------------- rust/lance/src/index/vector/builder.rs | 133 +------------ rust/lance/src/index/vector/ivf.rs | 4 +- rust/lance/src/index/vector/ivf/builder.rs | 5 +- rust/lance/src/index/vector/utils.rs | 2 +- 8 files changed, 30 insertions(+), 338 deletions(-) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 825f835f56a..4b058ce8382 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -20,6 +20,7 @@ use chrono::{Duration, TimeDelta, Utc}; use futures::{StreamExt, TryFutureExt}; use lance_index::vector::bq::RQBuildParams; use log::error; +use object_store::path::Path; use pyo3::exceptions::{PyStopIteration, PyTypeError}; use pyo3::types::{PyBytes, PyInt, PyList, PySet, PyString, PyTuple}; use pyo3::{IntoPyObjectExt, prelude::*}; @@ -3624,12 +3625,18 @@ fn prepare_vector_index_params( kwargs.get_item("precomputed_shuffle_buffers_path")?, ) { (Some(l), Some(p)) => { + let path = 
Path::parse(p.to_string()).map_err(|e| { + PyValueError::new_err(format!( + "Failed to parse precomputed_shuffle_buffers_path: {}", + e + )) + })?; let list = l .downcast::()? .iter() .map(|f| f.to_string()) .collect(); - ivf_params.precomputed_shuffle_buffers = Some((p.to_string(), list)); + ivf_params.precomputed_shuffle_buffers = Some((path, list)); } (None, None) => {} _ => { diff --git a/rust/lance-index/src/vector/ivf/builder.rs b/rust/lance-index/src/vector/ivf/builder.rs index 9dfcd675be8..caccd92d6c4 100644 --- a/rust/lance-index/src/vector/ivf/builder.rs +++ b/rust/lance-index/src/vector/ivf/builder.rs @@ -11,6 +11,7 @@ use arrow_array::{Array, FixedSizeListArray, UInt32Array, UInt64Array}; use futures::TryStreamExt; use lance_core::error::{Error, Result}; use lance_io::stream::RecordBatchStream; +use object_store::path::Path; /// Parameters to build IVF partitions #[derive(Debug, Clone)] @@ -46,7 +47,7 @@ pub struct IvfBuildParams { /// requires `centroids` to be set /// /// The input is expected to be (/dir/to/buffers, [buffer1.lance, buffer2.lance, ...]) - pub precomputed_shuffle_buffers: Option<(String, Vec)>, + pub precomputed_shuffle_buffers: Option<(Path, Vec)>, /// Precomputed partitioned artifact produced by an external backend. /// Mutually exclusive with other precomputed inputs and requires `centroids` to be set. 
diff --git a/rust/lance-index/src/vector/ivf/shuffler.rs b/rust/lance-index/src/vector/ivf/shuffler.rs index f78be7b0be2..f4e03c8f036 100644 --- a/rust/lance-index/src/vector/ivf/shuffler.rs +++ b/rust/lance-index/src/vector/ivf/shuffler.rs @@ -246,18 +246,12 @@ pub async fn shuffle_dataset( num_partitions: u32, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(String, Vec)>, + precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result>>> { // step 1: either use precomputed shuffle files or write shuffle data to a file let shuffler = if let Some((path, buffers)) = precomputed_shuffle_buffers { info!("Precomputed shuffle files provided, skip calculation of IVF partition."); - if path.contains("://") { - return Err(Error::not_supported( - "legacy IVF shuffler does not support remote precomputed_shuffle_buffers; use the V3 vector index builder path instead".to_string(), - )); - } - let mut shuffler = - IvfShuffler::try_new(num_partitions, Some(Path::parse(&path)?), true, None)?; + let mut shuffler = IvfShuffler::try_new(num_partitions, Some(path), true, None)?; unsafe { shuffler.set_unsorted_buffers(&buffers); } diff --git a/rust/lance-index/src/vector/v3/shuffler.rs b/rust/lance-index/src/vector/v3/shuffler.rs index 45c719d523a..20bed4cdc23 100644 --- a/rust/lance-index/src/vector/v3/shuffler.rs +++ b/rust/lance-index/src/vector/v3/shuffler.rs @@ -4,7 +4,6 @@ //! Shuffler is a component that takes a stream of record batches and shuffles them into //! the corresponding IVF partitions. 
-use std::collections::HashMap; use std::ops::Range; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; @@ -37,13 +36,6 @@ use object_store::path::Path; use crate::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN}; -const SHUFFLE_NUM_PARTITIONS_METADATA_KEY: &str = "lance:shuffle:num_partitions"; -const SHUFFLE_NUM_BATCHES_METADATA_KEY: &str = "lance:shuffle:num_batches"; -const SHUFFLE_PARTITION_COUNTS_METADATA_KEY: &str = "lance:shuffle:partition_counts"; -const SHUFFLE_TOTAL_LOSS_METADATA_KEY: &str = "lance:shuffle:total_loss"; -pub const SHUFFLE_DATA_FILE_NAME: &str = "shuffle_data.lance"; -pub const SHUFFLE_OFFSETS_FILE_NAME: &str = "shuffle_offsets.lance"; - #[async_trait::async_trait] /// A reader that can read the shuffled partitions. pub trait ShuffleReader: Send + Sync { @@ -443,7 +435,7 @@ impl Shuffler for TwoFileShuffler { ); // Create data file writer - let data_path = self.output_dir.child(SHUFFLE_DATA_FILE_NAME); + let data_path = self.output_dir.child("shuffle_data.lance"); let spill_path = self.output_dir.child("shuffle_data.spill"); let writer = self.object_store.create(&data_path).await?; let mut file_writer = FileWriter::try_new( @@ -454,7 +446,7 @@ impl Shuffler for TwoFileShuffler { .with_page_metadata_spill(self.object_store.clone(), spill_path); // Create offsets file writer - let offsets_path = self.output_dir.child(SHUFFLE_OFFSETS_FILE_NAME); + let offsets_path = self.output_dir.child("shuffle_offsets.lance"); let spill_path = self.output_dir.child("shuffle_offsets.spill"); let writer = self.object_store.create(&offsets_path).await?; let mut offsets_writer = FileWriter::try_new( @@ -535,37 +527,12 @@ impl Shuffler for TwoFileShuffler { .await?; } - let partition_counts_json = serde_json::to_string(&partition_counts).map_err(|e| { - Error::invalid_input(format!("Failed to serialize shuffle partition counts: {e}")) - })?; - let num_partitions_str = num_partitions.to_string(); - let num_batches_str = num_batches - 
.load(std::sync::atomic::Ordering::Relaxed) - .to_string(); - let total_loss_str = total_loss.lock().unwrap().to_string(); - for writer in [&mut file_writer, &mut offsets_writer] { - writer.add_schema_metadata( - SHUFFLE_NUM_PARTITIONS_METADATA_KEY, - num_partitions_str.clone(), - ); - writer.add_schema_metadata(SHUFFLE_NUM_BATCHES_METADATA_KEY, num_batches_str.clone()); - writer.add_schema_metadata( - SHUFFLE_PARTITION_COUNTS_METADATA_KEY, - partition_counts_json.clone(), - ); - writer.add_schema_metadata(SHUFFLE_TOTAL_LOSS_METADATA_KEY, total_loss_str.clone()); - } - // Finish files file_writer.finish().await?; offsets_writer.finish().await?; - let num_batches = num_batches_str - .parse::() - .expect("num_batches string was produced from u64"); - let total_loss_val = total_loss_str - .parse::() - .expect("total_loss string was produced from f64"); + let num_batches = num_batches.load(std::sync::atomic::Ordering::Relaxed); + let total_loss_val = *total_loss.lock().unwrap(); TwoFileShuffleReader::try_new( self.object_store.clone(), @@ -590,46 +557,6 @@ pub struct TwoFileShuffleReader { } impl TwoFileShuffleReader { - pub async fn try_open_existing( - object_store: Arc, - output_dir: Path, - data_file: impl AsRef, - offsets_file: impl AsRef, - ) -> Result> { - let scheduler_config = SchedulerConfig::max_bandwidth(&object_store); - let scheduler = ScanScheduler::new(object_store, scheduler_config); - - let file_reader = FileReader::try_open( - scheduler - .open_file( - &output_dir.child(data_file.as_ref()), - &CachedFileSize::unknown(), - ) - .await?, - None, - Arc::::default(), - &LanceCache::no_cache(), - FileReaderOptions::default(), - ) - .await?; - - let offsets_reader = FileReader::try_open( - scheduler - .open_file( - &output_dir.child(offsets_file.as_ref()), - &CachedFileSize::unknown(), - ) - .await?, - None, - Arc::::default(), - &LanceCache::no_cache(), - FileReaderOptions::default(), - ) - .await?; - - Self::from_existing_readers(scheduler, file_reader, 
offsets_reader) - } - async fn try_new( object_store: Arc, output_dir: Path, @@ -645,7 +572,7 @@ impl TwoFileShuffleReader { let scheduler_config = SchedulerConfig::max_bandwidth(&object_store); let scheduler = ScanScheduler::new(object_store, scheduler_config); - let data_path = output_dir.child(SHUFFLE_DATA_FILE_NAME); + let data_path = output_dir.child("shuffle_data.lance"); let file_reader = FileReader::try_open( scheduler .open_file(&data_path, &CachedFileSize::unknown()) @@ -657,7 +584,7 @@ impl TwoFileShuffleReader { ) .await?; - let offsets_path = output_dir.child(SHUFFLE_OFFSETS_FILE_NAME); + let offsets_path = output_dir.child("shuffle_offsets.lance"); let offsets_reader = FileReader::try_open( scheduler .open_file(&offsets_path, &CachedFileSize::unknown()) @@ -680,87 +607,6 @@ impl TwoFileShuffleReader { })) } - fn from_existing_readers( - scheduler: Arc, - file_reader: FileReader, - offsets_reader: FileReader, - ) -> Result> { - let metadata: &HashMap = &offsets_reader.schema().metadata; - - let num_partitions = metadata - .get(SHUFFLE_NUM_PARTITIONS_METADATA_KEY) - .ok_or_else(|| { - Error::invalid_input(format!( - "Missing required metadata key {SHUFFLE_NUM_PARTITIONS_METADATA_KEY} in precomputed V3 shuffle offsets file" - )) - })? - .parse::() - .map_err(|e| { - Error::invalid_input(format!( - "Invalid value for {SHUFFLE_NUM_PARTITIONS_METADATA_KEY}: {e}" - )) - })?; - let num_batches = metadata - .get(SHUFFLE_NUM_BATCHES_METADATA_KEY) - .ok_or_else(|| { - Error::invalid_input(format!( - "Missing required metadata key {SHUFFLE_NUM_BATCHES_METADATA_KEY} in precomputed V3 shuffle offsets file" - )) - })? 
- .parse::() - .map_err(|e| { - Error::invalid_input(format!( - "Invalid value for {SHUFFLE_NUM_BATCHES_METADATA_KEY}: {e}" - )) - })?; - let partition_counts = serde_json::from_str::>( - metadata - .get(SHUFFLE_PARTITION_COUNTS_METADATA_KEY) - .ok_or_else(|| { - Error::invalid_input(format!( - "Missing required metadata key {SHUFFLE_PARTITION_COUNTS_METADATA_KEY} in precomputed V3 shuffle offsets file" - )) - })?, - ) - .map_err(|e| { - Error::invalid_input(format!( - "Invalid value for {SHUFFLE_PARTITION_COUNTS_METADATA_KEY}: {e}" - )) - })?; - if partition_counts.len() != num_partitions { - return Err(Error::invalid_input(format!( - "Precomputed V3 shuffle partition count length {} does not match num_partitions {}", - partition_counts.len(), - num_partitions - ))); - } - let total_loss = metadata - .get(SHUFFLE_TOTAL_LOSS_METADATA_KEY) - .map(|value| { - value.parse::().map_err(|e| { - Error::invalid_input(format!( - "Invalid value for {SHUFFLE_TOTAL_LOSS_METADATA_KEY}: {e}" - )) - }) - }) - .transpose()? 
- .unwrap_or(0.0); - - if num_batches == 0 { - return Ok(Box::new(EmptyReader)); - } - - Ok(Box::new(Self { - _scheduler: scheduler, - file_reader, - offsets_reader, - num_partitions, - num_batches, - partition_counts, - total_loss, - })) - } - async fn partition_ranges(&self, partition_id: usize) -> Result>> { let mut positions = Vec::with_capacity(self.num_batches as usize * 2); for batch_idx in 0..self.num_batches { @@ -997,42 +843,6 @@ mod tests { assert!((loss - 4.25).abs() < 1e-10, "expected 4.25, got {}", loss); } - #[tokio::test] - async fn test_two_file_shuffler_reopen_existing_files() { - let dir = TempStrDir::default(); - let output_dir = Path::from(dir.as_ref()); - let num_partitions = 3; - - let batch1 = make_batch(&[0, 1, 2], &[10, 20, 30], Some(1.5)); - let batch2 = make_batch(&[2, 0, 1, 0], &[40, 50, 60, 70], Some(2.0)); - - let shuffler = TwoFileShuffler::new(output_dir.clone(), num_partitions); - let stream = batches_to_stream(vec![batch1, batch2]); - let _ = shuffler.shuffle(stream).await.unwrap(); - - let reopened = TwoFileShuffleReader::try_open_existing( - Arc::new(ObjectStore::local()), - output_dir, - SHUFFLE_DATA_FILE_NAME, - SHUFFLE_OFFSETS_FILE_NAME, - ) - .await - .unwrap(); - - assert_eq!(reopened.partition_size(0).unwrap(), 3); - assert_eq!(reopened.partition_size(1).unwrap(), 2); - assert_eq!(reopened.partition_size(2).unwrap(), 2); - - let p0 = collect_partition(reopened.as_ref(), 0).await.unwrap(); - let vals: &Int32Array = p0.column_by_name("val").unwrap().as_primitive(); - let mut v: Vec = vals.iter().map(|x| x.unwrap()).collect(); - v.sort(); - assert_eq!(v, vec![10, 50, 70]); - - let loss = reopened.total_loss().unwrap(); - assert!((loss - 3.5).abs() < 1e-10, "expected 3.5, got {}", loss); - } - #[tokio::test] async fn test_two_file_shuffler_single_batch() { let dir = TempStrDir::default(); diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 3e85a9d40b3..0edfbea4812 100644 --- 
a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -3,7 +3,6 @@ use std::cmp::Ordering; use std::collections::HashSet; -use std::path::Path as StdPath; use std::sync::Arc; use std::{collections::HashMap, pin::Pin}; @@ -45,10 +44,7 @@ use lance_index::vector::quantizer::{QuantizerMetadata, QuantizerStorage}; use lance_index::vector::shared::{SupportedIvfIndexType, write_unified_ivf_and_index_metadata}; use lance_index::vector::storage::STORAGE_METADATA_KEY; use lance_index::vector::transform::Flatten; -use lance_index::vector::v3::shuffler::{ - EmptyReader, IvfShufflerReader, SHUFFLE_DATA_FILE_NAME, SHUFFLE_OFFSETS_FILE_NAME, - TwoFileShuffleReader, -}; +use lance_index::vector::v3::shuffler::{EmptyReader, IvfShufflerReader}; use lance_index::vector::v3::subindex::SubIndexType; use lance_index::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN, PQ_CODE_COLUMN, VectorIndex}; use lance_index::vector::{PART_ID_FIELD, ivf::storage::IvfModel}; @@ -71,9 +67,7 @@ use lance_index::{ MIN_PARTITION_SIZE_PERCENT, }; use lance_io::local::to_local_path; -use lance_io::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor, -}; +use lance_io::object_store::ObjectStore; use lance_io::stream::RecordBatchStream; use lance_io::stream::RecordBatchStreamAdapter; use lance_linalg::distance::{DistanceType, Dot, L2, Normalize}; @@ -85,7 +79,6 @@ use tracing::{Level, instrument, span}; use crate::Dataset; use crate::dataset::ProjectionRequest; -use crate::dataset::builder::DatasetBuilder; use crate::dataset::index::dataset_format_version; use crate::index::vector::ivf::v2::PartitionEntry; use crate::index::vector::utils::infer_vector_dim; @@ -150,83 +143,6 @@ type BuildStream = Pin::Storage, S, f64)>>> + Send>>; impl IvfIndexBuilder { - fn precomputed_shuffle_buffers_uri(root: &str) -> String { - let uri = root.to_string(); - if uri.contains("://") { - uri - } else { - to_local_path(&Path::from(root)) - } - } - - fn 
precomputed_shuffle_buffers_root_uri(root: &str) -> String { - let uri = Self::precomputed_shuffle_buffers_uri(root); - if uri.ends_with("/data") { - uri.trim_end_matches("/data").to_string() - } else { - uri - } - } - - fn object_store_params(&self) -> ObjectStoreParams { - let mut params = ObjectStoreParams::default(); - if let Some(storage_options) = self - .ivf_params - .as_ref() - .and_then(|params| params.storage_options.clone()) - { - params.storage_options_accessor = Some(Arc::new( - StorageOptionsAccessor::with_static_options(storage_options), - )); - } - params - } - - async fn try_open_precomputed_v3_shuffle_reader( - &self, - root: &str, - files: &[String], - ) -> Result>> { - if files.len() != 2 { - return Ok(None); - } - - let mut data_file = None; - let mut offsets_file = None; - for file in files { - let Some(file_name) = StdPath::new(file).file_name() else { - return Ok(None); - }; - match file_name.to_string_lossy().as_ref() { - SHUFFLE_DATA_FILE_NAME => data_file = Some(SHUFFLE_DATA_FILE_NAME), - SHUFFLE_OFFSETS_FILE_NAME => offsets_file = Some(SHUFFLE_OFFSETS_FILE_NAME), - _ => return Ok(None), - } - } - let (Some(data_file), Some(offsets_file)) = (data_file, offsets_file) else { - return Ok(None); - }; - let registry = Arc::new(ObjectStoreRegistry::default()); - let params = self.object_store_params(); - let (object_store, output_dir) = ObjectStore::from_uri_and_params( - registry, - &Self::precomputed_shuffle_buffers_root_uri(root), - ¶ms, - ) - .await?; - - Ok(Some( - TwoFileShuffleReader::try_open_existing( - object_store, - output_dir, - data_file, - offsets_file, - ) - .await? - .into(), - )) - } - async fn try_open_precomputed_partition_artifact_reader( &self, uri: &str, @@ -644,38 +560,11 @@ impl IvfIndexBuilder .as_ref() .and_then(|p| p.precomputed_shuffle_buffers.as_ref()) { - Some((uri, files)) => { - if let Some(reader) = self - .try_open_precomputed_v3_shuffle_reader(uri, files) - .await? 
- { - log::info!("shuffle with precomputed V3 shuffle files from {}", uri); - self.shuffle_reader = Some(reader); - return Ok(()); - } - - let uri = Self::precomputed_shuffle_buffers_root_uri(uri); - let uri = if StdPath::new(&uri) - .file_name() - .is_some_and(|name| name == "data") - { - StdPath::new(&uri) - .parent() - .map(|path| path.to_string_lossy().to_string()) - .unwrap_or(uri) - } else { - uri - }; + Some((uri, _)) => { + let uri = to_local_path(uri); + let uri = uri.trim_end_matches("data"); log::info!("shuffle with precomputed shuffle buffers from {}", uri); - let mut builder = DatasetBuilder::from_uri(&uri); - if let Some(storage_options) = self - .ivf_params - .as_ref() - .and_then(|params| params.storage_options.clone()) - { - builder = builder.with_storage_options(storage_options); - } - let ds = builder.load().await?; + let ds = Dataset::open(uri).await?; ds.scan().try_into_stream().await? } _ => { @@ -2524,14 +2413,4 @@ mod tests { let row_ids = batches[0][ROW_ID].as_primitive::(); assert_eq!(row_ids.values(), &[4, 3, 2, 1, 0]); } - - #[test] - fn precomputed_shuffle_buffer_uri_preserves_remote_uri() { - assert_eq!( - IvfIndexBuilder::::precomputed_shuffle_buffers_root_uri( - "s3://bucket/shuffle" - ), - "s3://bucket/shuffle" - ); - } } diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 229d47f229b..34ce23f1eac 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -1818,7 +1818,7 @@ async fn write_ivf_pq_file( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(String, Vec)>, + precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result<()> { let path = index_dir.child(uuid).child(INDEX_FILE_NAME); let mut writer = object_store.create(&path).await?; @@ -1911,7 +1911,7 @@ async fn write_ivf_hnsw_file( precomputed_partitions: Option>, shuffle_partition_batches: usize, 
shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(String, Vec)>, + precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result<()> { let object_store = dataset.object_store(); let path = dataset.indices_dir().child(uuid).child(INDEX_FILE_NAME); diff --git a/rust/lance/src/index/vector/ivf/builder.rs b/rust/lance/src/index/vector/ivf/builder.rs index bcd47ae4057..9bd1ba95803 100644 --- a/rust/lance/src/index/vector/ivf/builder.rs +++ b/rust/lance/src/index/vector/ivf/builder.rs @@ -22,6 +22,7 @@ use lance_index::vector::{ivf::storage::IvfModel, transform::Transformer}; use lance_io::stream::RecordBatchStreamAdapter; use lance_table::io::manifest::ManifestDescribing; use log::info; +use object_store::path::Path; use tracing::instrument; use lance_core::{Error, ROW_ID, Result, traits::DatasetTakeRows}; @@ -54,7 +55,7 @@ pub(super) async fn build_partitions( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(String, Vec)>, + precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result<()> { let schema = data.schema(); if schema.column_with_name(column).is_none() { @@ -253,7 +254,7 @@ pub(super) async fn build_hnsw_partitions( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(String, Vec)>, + precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result<(Vec, IvfModel)> { let schema = data.schema(); if schema.column_with_name(column).is_none() { diff --git a/rust/lance/src/index/vector/utils.rs b/rust/lance/src/index/vector/utils.rs index 244a02c39bc..19156ac8eed 100644 --- a/rust/lance/src/index/vector/utils.rs +++ b/rust/lance/src/index/vector/utils.rs @@ -372,7 +372,7 @@ impl PartitionLoadLock { /// /// Handles both regular vector columns (FixedSizeList) and multivector columns /// (List\), flattening the latter. 
-pub fn vector_column_to_fsl(batch: &RecordBatch, column: &str) -> Result { +fn vector_column_to_fsl(batch: &RecordBatch, column: &str) -> Result { let array = get_column_from_batch(batch, column)?; match array.data_type() { arrow::datatypes::DataType::FixedSizeList(_, _) => Ok(array.as_fixed_size_list().clone()), From 51a141bd947533dff1659c6ea080197c164a5385 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 9 Apr 2026 00:33:54 +0800 Subject: [PATCH 20/21] docs: document partition artifact internals --- .../src/index/vector/partition_artifact.rs | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/rust/lance/src/index/vector/partition_artifact.rs b/rust/lance/src/index/vector/partition_artifact.rs index cfd2a0f2b4a..fe585f513be 100644 --- a/rust/lance/src/index/vector/partition_artifact.rs +++ b/rust/lance/src/index/vector/partition_artifact.rs @@ -39,6 +39,11 @@ const PARTITION_ARTIFACT_BUCKET_PREFIX: &str = "bucket-"; const PARTITION_ARTIFACT_FILE_VERSION: &str = "2.2"; const PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS: usize = 32 * 1024; +/// Top-level manifest for a precomputed partition artifact. +/// +/// The manifest is intentionally small and JSON-encoded so an external backend +/// can materialize partition data once and Lance can reopen it later without +/// understanding any backend-specific details. #[derive(Debug, Serialize, Deserialize)] struct PartitionArtifactManifest { version: u32, @@ -50,6 +55,11 @@ struct PartitionArtifactManifest { partitions: Vec, } +/// Describes where one logical IVF partition lives inside the artifact. +/// +/// Multiple logical partitions can share the same physical file when they hash +/// to the same bucket. `ranges` records the row spans within that file that +/// belong to this partition. 
#[derive(Debug, Clone, Serialize, Deserialize)] struct PartitionArtifactPartition { #[serde(default)] @@ -60,12 +70,22 @@ struct PartitionArtifactPartition { ranges: Vec, } +/// A contiguous row range for a partition inside one bucket file. +/// +/// The builder sorts each finalized bucket by partition id, so a partition is +/// usually represented by a single range. The type still allows multiple runs +/// so the reader does not depend on that implementation detail. #[derive(Debug, Clone, Serialize, Deserialize)] struct PartitionArtifactRange { offset: u64, num_rows: u64, } +/// In-memory staging buffer for one bucket before it is flushed to disk. +/// +/// Batches arrive grouped arbitrarily by the backend. The builder first +/// appends rows into per-bucket buffers so it can write larger sequential runs +/// to temporary files instead of issuing tiny file writes. #[derive(Default, Debug)] struct BucketBuffer { row_ids: Vec, @@ -74,15 +94,26 @@ struct BucketBuffer { } impl BucketBuffer { + /// Number of staged rows currently buffered for this bucket. fn len(&self) -> usize { self.row_ids.len() } + /// Whether the bucket currently has any staged rows. fn is_empty(&self) -> bool { self.row_ids.is_empty() } } +/// Writes partition-addressable encoded rows for a later Lance finalization. +/// +/// The builder uses a two-phase layout: +/// 1. Append arbitrary input batches into temporary bucket files. +/// 2. Reopen each bucket, sort rows by partition id, and rewrite one finalized +/// bucket file plus a compact manifest that records per-partition ranges. +/// +/// This keeps the write path sequential and bounded in memory while still +/// giving the finalizer efficient partition reads. pub struct PartitionArtifactBuilder { object_store: Arc, root_dir: Path, @@ -96,6 +127,11 @@ pub struct PartitionArtifactBuilder { } impl PartitionArtifactBuilder { + /// Create a builder from a URI and optional storage options. 
+ /// + /// This is the external entry point used by backends that only know an + /// artifact URI. It resolves the object store and then delegates to the + /// store-aware constructor. pub async fn try_new( uri: &str, num_partitions: usize, @@ -120,6 +156,12 @@ impl PartitionArtifactBuilder { Self::try_new_with_store(object_store, root_dir, num_partitions, pq_code_width) } + /// Create a builder against an already-resolved object store. + /// + /// The builder precomputes the temporary and final schemas and allocates + /// one staging buffer per bucket. Buckets are a write-time sharding scheme: + /// they are not visible to readers, but they keep memory usage bounded and + /// avoid one file per partition. pub fn try_new_with_store( object_store: Arc, root_dir: Path, @@ -177,6 +219,11 @@ impl PartitionArtifactBuilder { }) } + /// Append one encoded batch into the artifact staging area. + /// + /// Input batches must already contain row ids, partition ids, and PQ codes. + /// Rows are redistributed into bucket-local in-memory buffers and flushed to + /// temporary files once they become large enough. pub async fn append_batch(&mut self, batch: &RecordBatch) -> Result<()> { validate_input_batch(batch, self.pq_code_width)?; @@ -210,6 +257,11 @@ impl PartitionArtifactBuilder { Ok(()) } + /// Finalize the artifact and return the relative files that were created. + /// + /// Finalization flushes all remaining staging buffers, rewrites each bucket + /// into its final sorted form, and emits a manifest that lets Lance reopen + /// the artifact as a [`ShuffleReader`]. pub async fn finish( &mut self, metadata_file: &str, @@ -259,6 +311,12 @@ impl PartitionArtifactBuilder { Ok(files) } + /// Flush the current in-memory buffer for one bucket into its temporary + /// file. + /// + /// Temporary files preserve the original row order inside the bucket. The + /// expensive partition sort is deferred to `finalize_bucket`, so append-time + /// stays cheap. 
async fn flush_bucket(&mut self, bucket_id: usize) -> Result<()> { if self.buffers[bucket_id].is_empty() { return Ok(()); @@ -270,6 +328,8 @@ impl PartitionArtifactBuilder { Ok(()) } + /// Convert a bucket's staged vectors into a temporary batch and empty the + /// in-memory buffer. fn take_temp_batch(&mut self, bucket_id: usize) -> Result { let buffer = &mut self.buffers[bucket_id]; let row_ids = UInt64Array::from(mem::take(&mut buffer.row_ids)); @@ -284,6 +344,10 @@ impl PartitionArtifactBuilder { .map_err(Error::from) } + /// Lazily create the temporary writer for a bucket. + /// + /// Buckets that never receive rows never create a file, which keeps sparse + /// artifacts compact. async fn ensure_temp_writer(&mut self, bucket_id: usize) -> Result<&mut FileWriter> { if self.temp_writers[bucket_id].is_none() { let path = self.temp_bucket_path(bucket_id); @@ -299,6 +363,12 @@ impl PartitionArtifactBuilder { .expect("temp writer initialized")) } + /// Rewrite one temporary bucket into its final on-disk representation. + /// + /// All rows for the bucket are loaded, sorted by partition id, and written + /// to a single final bucket file that stores only the row id and PQ code. + /// The manifest is updated with the row ranges for each partition contained + /// in this bucket. async fn finalize_bucket( &self, bucket_id: usize, @@ -412,6 +482,7 @@ impl PartitionArtifactBuilder { Ok(Some(final_relative_path)) } + /// Path of the temporary file used while accumulating one bucket. fn temp_bucket_path(&self, bucket_id: usize) -> Path { self.root_dir .child(PARTITION_ARTIFACT_PARTITIONS_DIR) @@ -420,6 +491,7 @@ impl PartitionArtifactBuilder { )) } + /// Path of the finalized file for one bucket. fn final_bucket_path(&self, bucket_id: usize) -> Path { self.root_dir .child(PARTITION_ARTIFACT_PARTITIONS_DIR) @@ -428,6 +500,7 @@ impl PartitionArtifactBuilder { )) } + /// Relative path recorded in the manifest for one finalized bucket. 
fn final_bucket_relative_path(&self, bucket_id: usize) -> String { format!( "{PARTITION_ARTIFACT_PARTITIONS_DIR}/{PARTITION_ARTIFACT_BUCKET_PREFIX}{bucket_id:05}.lance" @@ -435,6 +508,11 @@ impl PartitionArtifactBuilder { } } +/// Reopens a partition artifact as a `ShuffleReader`. +/// +/// The final Lance builder consumes artifacts through the generic +/// [`ShuffleReader`] interface, so this adapter hides the manifest parsing and +/// file caching needed to expose partition-local record batch streams. #[derive(Debug)] pub(crate) struct PartitionArtifactShuffleReader { scheduler: Arc, @@ -444,6 +522,10 @@ pub(crate) struct PartitionArtifactShuffleReader { file_readers: Mutex>>, } +/// Writer options for all files stored inside a partition artifact. +/// +/// The artifact uses a fixed file version so external backends and Lance +/// finalization agree on the on-disk layout. fn file_writer_options() -> Result { Ok(FileWriterOptions { format_version: Some( @@ -460,6 +542,10 @@ fn file_writer_options() -> Result { }) } +/// Validate that a backend-produced batch matches the artifact contract. +/// +/// The builder is intentionally strict here because any schema drift would only +/// surface much later during finalization. fn validate_input_batch(batch: &RecordBatch, pq_code_width: usize) -> Result<()> { let Some(row_ids) = batch.column_by_name(ROW_ID) else { return Err(Error::invalid_input(format!( @@ -497,6 +583,7 @@ fn validate_input_batch(batch: &RecordBatch, pq_code_width: usize) -> Result<()> } } +/// Serialize a small JSON sidecar directly into the object store. async fn write_json( object_store: &ObjectStore, path: &Path, @@ -515,6 +602,7 @@ async fn write_json( } impl PartitionArtifactShuffleReader { + /// Open an artifact reader from a URI and optional storage options. 
pub(crate) async fn try_open( uri: &str, storage_options: Option<&HashMap>, @@ -537,6 +625,10 @@ impl PartitionArtifactShuffleReader { Self::try_open_with_store(object_store, root_dir).await } + /// Open an artifact reader once the object store has already been resolved. + /// + /// This reads the manifest once, validates it, and initializes the shared + /// scheduler and reader cache used by partition reads. async fn try_open_with_store(object_store: Arc, root_dir: Path) -> Result { let manifest_path = root_dir.child("manifest.json"); let manifest_bytes = object_store.read_one_all(&manifest_path).await?; @@ -574,6 +666,10 @@ impl PartitionArtifactShuffleReader { }) } + /// Open and cache a file reader for a finalized bucket file. + /// + /// Multiple logical partitions can point at the same bucket file, so the + /// reader cache prevents redundant file opens during finalization. async fn open_file_reader(&self, relative_path: &str) -> Result> { if let Some(reader) = self .file_readers @@ -606,6 +702,7 @@ impl PartitionArtifactShuffleReader { } } +/// Join a manifest-relative path onto the artifact root. fn join_relative_path(root_dir: &Path, relative_path: &str) -> Path { relative_path .split('/') @@ -615,6 +712,11 @@ fn join_relative_path(root_dir: &Path, relative_path: &str) -> Path { #[async_trait::async_trait] impl ShuffleReader for PartitionArtifactShuffleReader { + /// Return a stream over all rows belonging to one logical partition. + /// + /// The manifest already records the precise row ranges for each partition, + /// so the reader can issue targeted range reads without scanning unrelated + /// partitions. async fn read_partition( &self, partition_id: usize, @@ -659,6 +761,7 @@ impl ShuffleReader for PartitionArtifactShuffleReader { )))) } + /// Number of encoded rows available for one logical partition. 
fn partition_size(&self, partition_id: usize) -> Result { Ok(self .partitions @@ -667,6 +770,7 @@ impl ShuffleReader for PartitionArtifactShuffleReader { .unwrap_or(0)) } + /// Optional training loss propagated from the backend into the artifact. fn total_loss(&self) -> Option { self.total_loss } From 15e42fddf303aa8baaea5399cbeaefcead974b3b Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 9 Apr 2026 15:03:51 +0800 Subject: [PATCH 21/21] fix: stream partition artifact writes --- python/python/lance/cuvs.py | 5 +- python/python/lance/dataset.py | 1 + python/python/tests/test_vector_index.py | 3 + .../src/index/vector/partition_artifact.rs | 326 ++++++++---------- 4 files changed, 145 insertions(+), 190 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index ba7a1a67738..6bc8dbd5312 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -6,7 +6,7 @@ import os import tempfile from importlib import import_module -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: from pathlib import Path @@ -34,6 +34,7 @@ def build_vector_index_on_cuvs( num_partitions: int, num_sub_vectors: int, dst_dataset_uri: str | Path | None = None, + storage_options: Optional[dict[str, str]] = None, *, sample_rate: int = 256, max_iters: int = 50, @@ -60,6 +61,7 @@ def build_vector_index_on_cuvs( max_iters=max_iters, num_bits=num_bits, filter_nan=filter_nan, + storage_options=storage_options, ) artifact = backend.build_ivf_pq_artifact( dataset.uri, @@ -68,6 +70,7 @@ def build_vector_index_on_cuvs( artifact_uri=artifact_uri, batch_size=batch_size, filter_nan=filter_nan, + storage_options=storage_options, ) return ( artifact.artifact_uri, diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 316f18d642f..c786b8f7cce 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2987,6 +2987,7 @@ def _create_index_impl( accelerator, num_partitions, 
num_sub_vectors, + storage_options=storage_options, sample_rate=kwargs.get("sample_rate", 256), max_iters=kwargs.get("max_iters", 50), num_bits=kwargs.get("num_bits", 8), diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 486e670f123..f6470a5bd8e 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -682,6 +682,7 @@ def build_ivf_pq_artifact(self, dataset_uri, column, **kwargs): 4, 16, dst_dataset_uri=tmp_path / "artifact_root", + storage_options={"region": "us-east-1"}, sample_rate=7, max_iters=20, num_bits=4, @@ -700,6 +701,7 @@ def build_ivf_pq_artifact(self, dataset_uri, column, **kwargs): "max_iters": 20, "num_bits": 4, "filter_nan": False, + "storage_options": {"region": "us-east-1"}, } assert calls["build"]["dataset_uri"] == ds.uri assert calls["build"]["column"] == "vector" @@ -707,6 +709,7 @@ def build_ivf_pq_artifact(self, dataset_uri, column, **kwargs): assert calls["build"]["artifact_uri"] == str(tmp_path / "artifact_root") assert calls["build"]["batch_size"] == 4096 assert calls["build"]["filter_nan"] is False + assert calls["build"]["storage_options"] == {"region": "us-east-1"} assert artifact_uri == str(tmp_path / "artifact") assert files == [str(tmp_path / "artifact" / "data.lance")] assert ivf_centroids.equals(training.ivf_centroids()) diff --git a/rust/lance/src/index/vector/partition_artifact.rs b/rust/lance/src/index/vector/partition_artifact.rs index fe585f513be..a721437358d 100644 --- a/rust/lance/src/index/vector/partition_artifact.rs +++ b/rust/lance/src/index/vector/partition_artifact.rs @@ -7,9 +7,8 @@ use std::ops::Range; use std::sync::{Arc, Mutex}; use arrow_array::cast::AsArray; -use arrow_array::{FixedSizeListArray, RecordBatch, UInt8Array, UInt32Array, UInt64Array}; +use arrow_array::{FixedSizeListArray, RecordBatch, UInt8Array, UInt64Array}; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; -use futures::TryStreamExt; use 
lance_arrow::FixedSizeListArrayExt; use lance_core::cache::LanceCache; use lance_core::datatypes::Schema; @@ -34,7 +33,6 @@ const PARTITION_ARTIFACT_MANIFEST_VERSION: u32 = 1; const PARTITION_ARTIFACT_MANIFEST_FILE_NAME: &str = "manifest.json"; const PARTITION_ARTIFACT_PARTITIONS_DIR: &str = "partitions"; const PARTITION_ARTIFACT_DEFAULT_BUCKETS: usize = 256; -const PARTITION_ARTIFACT_STAGING_PREFIX: &str = ".staging-bucket-"; const PARTITION_ARTIFACT_BUCKET_PREFIX: &str = "bucket-"; const PARTITION_ARTIFACT_FILE_VERSION: &str = "2.2"; const PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS: usize = 32 * 1024; @@ -107,23 +105,22 @@ impl BucketBuffer { /// Writes partition-addressable encoded rows for a later Lance finalization. /// -/// The builder uses a two-phase layout: -/// 1. Append arbitrary input batches into temporary bucket files. -/// 2. Reopen each bucket, sort rows by partition id, and rewrite one finalized -/// bucket file plus a compact manifest that records per-partition ranges. -/// -/// This keeps the write path sequential and bounded in memory while still -/// giving the finalizer efficient partition reads. +/// The builder uses bucket-local buffering to keep append-time memory bounded. +/// Each flush sorts only the current in-memory bucket and appends it directly to +/// the finalized bucket file, while the manifest accumulates per-partition row +/// ranges. This keeps the writer streaming and avoids a full read/sort/rewrite +/// pass at `finish()` time. pub struct PartitionArtifactBuilder { object_store: Arc, root_dir: Path, num_partitions: usize, num_buckets: usize, pq_code_width: usize, - temp_schema: Arc, final_schema: Arc, - temp_writers: Vec>, + final_writers: Vec>, buffers: Vec, + partitions: Vec, + bucket_row_counts: Vec, } impl PartitionArtifactBuilder { @@ -158,10 +155,10 @@ impl PartitionArtifactBuilder { /// Create a builder against an already-resolved object store. 
/// - /// The builder precomputes the temporary and final schemas and allocates - /// one staging buffer per bucket. Buckets are a write-time sharding scheme: - /// they are not visible to readers, but they keep memory usage bounded and - /// avoid one file per partition. + /// The builder precomputes the final schema and allocates one staging + /// buffer per bucket. Buckets are a write-time sharding scheme: they are + /// not visible to readers, but they keep memory usage bounded and avoid one + /// file per partition. pub fn try_new_with_store( object_store: Arc, root_dir: Path, @@ -182,18 +179,6 @@ impl PartitionArtifactBuilder { let num_buckets = num_partitions .min(PARTITION_ARTIFACT_DEFAULT_BUCKETS) .max(1); - let temp_schema = Arc::new(ArrowSchema::new(vec![ - Field::new(ROW_ID, DataType::UInt64, false), - Field::new(PART_ID_COLUMN, DataType::UInt32, false), - Field::new( - PQ_CODE_COLUMN, - DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::UInt8, true)), - pq_code_width as i32, - ), - true, - ), - ])); let final_schema = Arc::new(ArrowSchema::new(vec![ Field::new(ROW_ID, DataType::UInt64, false), Field::new( @@ -212,10 +197,18 @@ impl PartitionArtifactBuilder { num_partitions, num_buckets, pq_code_width, - temp_schema, final_schema, - temp_writers: (0..num_buckets).map(|_| None).collect(), + final_writers: (0..num_buckets).map(|_| None).collect(), buffers: (0..num_buckets).map(|_| BucketBuffer::default()).collect(), + partitions: vec![ + PartitionArtifactPartition { + path: None, + num_rows: 0, + ranges: Vec::new(), + }; + num_partitions + ], + bucket_row_counts: vec![0; num_buckets], }) } @@ -259,9 +252,9 @@ impl PartitionArtifactBuilder { /// Finalize the artifact and return the relative files that were created. /// - /// Finalization flushes all remaining staging buffers, rewrites each bucket - /// into its final sorted form, and emits a manifest that lets Lance reopen - /// the artifact as a [`ShuffleReader`]. 
+ /// Finalization only needs to flush the remaining in-memory buffers and + /// persist the manifest because bucket files are already in their final + /// layout. pub async fn finish( &mut self, metadata_file: &str, @@ -270,25 +263,16 @@ impl PartitionArtifactBuilder { for bucket_id in 0..self.num_buckets { self.flush_bucket(bucket_id).await?; } - for writer in self.temp_writers.iter_mut() { + for writer in self.final_writers.iter_mut() { if let Some(writer) = writer.as_mut() { writer.finish().await?; } } - let mut partitions = vec![ - PartitionArtifactPartition { - path: None, - num_rows: 0, - ranges: Vec::new(), - }; - self.num_partitions - ]; let mut artifact_files = Vec::with_capacity(self.num_buckets + 1); - for bucket_id in 0..self.num_buckets { - if let Some(relative_path) = self.finalize_bucket(bucket_id, &mut partitions).await? { - artifact_files.push(relative_path); + if self.final_writers[bucket_id].is_some() { + artifact_files.push(self.final_bucket_relative_path(bucket_id)); } } @@ -297,7 +281,7 @@ impl PartitionArtifactBuilder { num_partitions: self.num_partitions, metadata_file: Some(metadata_file.to_string()), total_loss, - partitions, + partitions: self.partitions.clone(), }; write_json( self.object_store.as_ref(), @@ -311,153 +295,39 @@ impl PartitionArtifactBuilder { Ok(files) } - /// Flush the current in-memory buffer for one bucket into its temporary - /// file. + /// Flush the current in-memory buffer for one bucket into its finalized + /// bucket file. /// - /// Temporary files preserve the original row order inside the bucket. The - /// expensive partition sort is deferred to `finalize_bucket`, so append-time - /// stays cheap. + /// Each flush sorts only the buffered rows for this bucket and appends them + /// to the final file while recording new manifest ranges for the affected + /// partitions. 
async fn flush_bucket(&mut self, bucket_id: usize) -> Result<()> { if self.buffers[bucket_id].is_empty() { return Ok(()); } - let batch = self.take_temp_batch(bucket_id)?; - let writer = self.ensure_temp_writer(bucket_id).await?; - writer.write_batch(&batch).await?; - Ok(()) - } - - /// Convert a bucket's staged vectors into a temporary batch and empty the - /// in-memory buffer. - fn take_temp_batch(&mut self, bucket_id: usize) -> Result { let buffer = &mut self.buffers[bucket_id]; let row_ids = UInt64Array::from(mem::take(&mut buffer.row_ids)); - let part_ids = UInt32Array::from(mem::take(&mut buffer.partition_ids)); + let part_ids = mem::take(&mut buffer.partition_ids); let pq_values = UInt8Array::from(mem::take(&mut buffer.pq_values)); - let pq_codes = - FixedSizeListArray::try_new_from_values(pq_values, self.pq_code_width as i32)?; - RecordBatch::try_new( - self.temp_schema.clone(), - vec![Arc::new(row_ids), Arc::new(part_ids), Arc::new(pq_codes)], - ) - .map_err(Error::from) - } - - /// Lazily create the temporary writer for a bucket. - /// - /// Buckets that never receive rows never create a file, which keeps sparse - /// artifacts compact. - async fn ensure_temp_writer(&mut self, bucket_id: usize) -> Result<&mut FileWriter> { - if self.temp_writers[bucket_id].is_none() { - let path = self.temp_bucket_path(bucket_id); - let writer = FileWriter::try_new( - self.object_store.create(&path).await?, - Schema::try_from(self.temp_schema.as_ref())?, - file_writer_options()?, - )?; - self.temp_writers[bucket_id] = Some(writer); - } - Ok(self.temp_writers[bucket_id] - .as_mut() - .expect("temp writer initialized")) - } - - /// Rewrite one temporary bucket into its final on-disk representation. - /// - /// All rows for the bucket are loaded, sorted by partition id, and written - /// to a single final bucket file that stores only the row id and PQ code. - /// The manifest is updated with the row ranges for each partition contained - /// in this bucket. 
- async fn finalize_bucket( - &self, - bucket_id: usize, - partitions: &mut [PartitionArtifactPartition], - ) -> Result> { - let temp_path = self.temp_bucket_path(bucket_id); - if !self.object_store.exists(&temp_path).await? { - return Ok(None); - } - - let reader = FileReader::try_open( - ScanScheduler::new( - self.object_store.clone(), - SchedulerConfig::max_bandwidth(&self.object_store), - ) - .open_file(&temp_path, &CachedFileSize::unknown()) - .await?, - None, - Arc::::default(), - &LanceCache::no_cache(), - FileReaderOptions::default(), - ) - .await?; - - let batches = reader - .read_stream( - ReadBatchParams::RangeFull, - u32::MAX, - 16, - FilterExpression::no_filter(), - )? - .try_collect::>() - .await?; - let total_rows = batches.iter().map(|batch| batch.num_rows()).sum::(); - if total_rows == 0 { - self.object_store.delete(&temp_path).await?; - return Ok(None); - } - - let mut row_ids = Vec::with_capacity(total_rows); - let mut partition_ids = Vec::with_capacity(total_rows); - let mut pq_values = Vec::with_capacity(total_rows * self.pq_code_width); - for batch in batches { - let batch_row_ids = batch[ROW_ID].as_primitive::(); - let batch_partition_ids = - batch[PART_ID_COLUMN].as_primitive::(); - let batch_pq = batch[PQ_CODE_COLUMN].as_fixed_size_list(); - let batch_pq_values = batch_pq - .values() - .as_primitive::(); - row_ids.extend(batch_row_ids.values().iter().copied()); - partition_ids.extend(batch_partition_ids.values().iter().copied()); - pq_values.extend_from_slice(batch_pq_values.values().as_ref()); - } + let total_rows = row_ids.len(); let mut permutation = (0..total_rows).collect::>(); - permutation.sort_unstable_by_key(|&idx| partition_ids[idx]); + permutation.sort_unstable_by_key(|&idx| part_ids[idx]); let mut sorted_row_ids = Vec::with_capacity(total_rows); let mut sorted_partition_ids = Vec::with_capacity(total_rows); let mut sorted_pq_values = Vec::with_capacity(total_rows * self.pq_code_width); for idx in permutation { - 
sorted_row_ids.push(row_ids[idx]); - sorted_partition_ids.push(partition_ids[idx]); + sorted_row_ids.push(row_ids.value(idx)); + sorted_partition_ids.push(part_ids[idx]); let start = idx * self.pq_code_width; let end = start + self.pq_code_width; - sorted_pq_values.extend_from_slice(&pq_values[start..end]); + sorted_pq_values.extend_from_slice(&pq_values.values()[start..end]); } - let final_path = self.final_bucket_path(bucket_id); + let file_offset = self.bucket_row_counts[bucket_id]; let final_relative_path = self.final_bucket_relative_path(bucket_id); - let mut writer = FileWriter::try_new( - self.object_store.create(&final_path).await?, - Schema::try_from(self.final_schema.as_ref())?, - file_writer_options()?, - )?; - let final_batch = RecordBatch::try_new( - self.final_schema.clone(), - vec![ - Arc::new(UInt64Array::from(sorted_row_ids)), - Arc::new(FixedSizeListArray::try_new_from_values( - UInt8Array::from(sorted_pq_values), - self.pq_code_width as i32, - )?), - ], - )?; - writer.write_batch(&final_batch).await?; - writer.finish().await?; - let mut offset = 0usize; while offset < sorted_partition_ids.len() { let partition_id = sorted_partition_ids[offset] as usize; @@ -467,28 +337,59 @@ impl PartitionArtifactBuilder { { end += 1; } - partitions[partition_id] = PartitionArtifactPartition { - path: Some(final_relative_path.clone()), - num_rows: end - offset, - ranges: vec![PartitionArtifactRange { - offset: offset as u64, - num_rows: (end - offset) as u64, - }], - }; + let partition = &mut self.partitions[partition_id]; + match &partition.path { + Some(existing) if existing != &final_relative_path => { + return Err(Error::io(format!( + "partition {} is split across multiple bucket files: '{}' vs '{}'", + partition_id, existing, final_relative_path + ))); + } + None => partition.path = Some(final_relative_path.clone()), + _ => {} + } + partition.num_rows += end - offset; + partition.ranges.push(PartitionArtifactRange { + offset: file_offset + offset as u64, + 
num_rows: (end - offset) as u64, + }); offset = end; } - self.object_store.delete(&temp_path).await?; - Ok(Some(final_relative_path)) + let pq_codes = FixedSizeListArray::try_new_from_values( + UInt8Array::from(sorted_pq_values), + self.pq_code_width as i32, + )?; + let final_batch = RecordBatch::try_new( + self.final_schema.clone(), + vec![ + Arc::new(UInt64Array::from(sorted_row_ids)), + Arc::new(pq_codes), + ], + )?; + let writer = self.ensure_final_writer(bucket_id).await?; + writer.write_batch(&final_batch).await?; + self.bucket_row_counts[bucket_id] += total_rows as u64; + Ok(()) } - /// Path of the temporary file used while accumulating one bucket. - fn temp_bucket_path(&self, bucket_id: usize) -> Path { - self.root_dir - .child(PARTITION_ARTIFACT_PARTITIONS_DIR) - .child(format!( - "{PARTITION_ARTIFACT_STAGING_PREFIX}{bucket_id:05}.lance" - )) + /// Lazily create the finalized writer for a bucket. + /// + /// Buckets that never receive rows never create a file, which keeps sparse + /// artifacts compact. + async fn ensure_final_writer(&mut self, bucket_id: usize) -> Result<&mut FileWriter> { + if self.final_writers[bucket_id].is_none() { + let path = self.final_bucket_path(bucket_id); + let writer = FileWriter::try_new( + self.object_store.create(&path).await?, + Schema::try_from(self.final_schema.as_ref())?, + file_writer_options()?, + )?; + self.final_writers[bucket_id] = Some(writer); + } + Ok(self.final_writers[bucket_id] + .as_mut() + .expect("final writer initialized")) } /// Path of the finalized file for one bucket. @@ -1057,4 +958,51 @@ mod tests { .unwrap_err(); assert!(matches!(error, Error::InvalidInput { .. 
})); } + + #[tokio::test] + async fn partition_artifact_builder_records_multiple_ranges_for_repeated_flushes() { + let tempdir = tempfile::tempdir().unwrap(); + let root_dir = tempdir.path().join("artifact"); + fs::create_dir_all(&root_dir).unwrap(); + let object_store = Arc::new(ObjectStore::local()); + let root_path = Path::from_filesystem_path(&root_dir).unwrap(); + + let mut builder = + PartitionArtifactBuilder::try_new_with_store(object_store, root_path, 4, 2).unwrap(); + let num_rows = PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS + 1024; + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new(ROW_ID, DataType::UInt64, false), + Field::new(PART_ID_COLUMN, DataType::UInt32, false), + Field::new( + PQ_CODE_COLUMN, + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::UInt8, true)), 2), + true, + ), + ])); + let row_ids = UInt64Array::from_iter_values((0..num_rows as u64).into_iter()); + let part_ids = UInt32Array::from_iter_values((0..num_rows).map(|_| 0_u32)); + let pq_values = UInt8Array::from_iter_values((0..num_rows * 2).map(|v| (v % 251) as u8)); + let pq_codes = FixedSizeListArray::try_new_from_values(pq_values, 2).unwrap(); + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(row_ids), Arc::new(part_ids), Arc::new(pq_codes)], + ) + .unwrap(); + + builder.append_batch(&batch).await.unwrap(); + builder.finish("metadata.lance", None).await.unwrap(); + + let manifest: PartitionArtifactManifest = + serde_json::from_slice(&fs::read(root_dir.join("manifest.json")).unwrap()).unwrap(); + assert_eq!(manifest.partitions[0].num_rows, num_rows); + assert_eq!(manifest.partitions[0].ranges.len(), 2); + assert_eq!( + manifest.partitions[0].ranges[0].num_rows, + PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS as u64 + ); + assert_eq!( + manifest.partitions[0].ranges[1].offset, + PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS as u64 + ); + } }