From f2be9cc3580c4cf1f987bca7ae0f41d6ed9cdaf5 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 02:01:12 +0800 Subject: [PATCH 01/21] python: add explicit cuvs accelerator path --- python/python/lance/cuvs.py | 238 +++++++++++++++++++++++ python/python/lance/dataset.py | 142 +++++++++----- python/python/lance/indices/builder.py | 25 +++ python/python/tests/test_vector_index.py | 129 ++++++++++++ 4 files changed, 480 insertions(+), 54 deletions(-) create mode 100644 python/python/lance/cuvs.py diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py new file mode 100644 index 00000000000..ab46eb8a432 --- /dev/null +++ b/python/python/lance/cuvs.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +from __future__ import annotations + +from importlib import import_module +from typing import Tuple + +import pyarrow as pa + +from .dependencies import numpy as np + + +def is_cuvs_accelerator(accelerator: object) -> bool: + return accelerator == "cuvs" + + +def _require_cuvs(): + try: + return import_module("cuvs.neighbors.ivf_pq") + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "accelerator='cuvs' requires the 'cuvs' package to be installed" + ) from exc + + +def _optional_cupy(): + try: + return import_module("cupy") + except ModuleNotFoundError: + return None + + +def _metric_to_cuvs(metric_type: str) -> str: + metric_type = metric_type.lower() + if metric_type in {"l2", "euclidean"}: + return "sqeuclidean" + if metric_type == "dot": + return "inner_product" + if metric_type == "cosine": + return "cosine" + raise ValueError(f"Metric '{metric_type}' is not supported by cuVS IVF_PQ") + + +def _column_to_numpy(table: pa.Table, column: str) -> np.ndarray: + array = table.column(column).combine_chunks() + values = array.to_pylist() + if len(values) == 0: + raise ValueError("cuVS training requires at least one training vector") + matrix = np.asarray(values) + if 
matrix.ndim != 2: + raise ValueError( + f"Expected a 2D training matrix for column '{column}', got {matrix.shape}" + ) + if matrix.dtype == np.float64: + matrix = matrix.astype(np.float32) + elif matrix.dtype not in (np.float16, np.float32): + matrix = matrix.astype(np.float32) + return matrix + + +def _as_numpy(array_like) -> np.ndarray: + if isinstance(array_like, np.ndarray): + return array_like + try: + array = np.asarray(array_like) + if isinstance(array, np.ndarray): + return array + except Exception: + pass + + if hasattr(array_like, "get"): + return np.asarray(array_like.get()) + + cupy = _optional_cupy() + if cupy is not None: + return cupy.asnumpy(array_like) + + raise TypeError("Unable to convert cuVS output to numpy") + + +def _normalize_centroids(index, num_partitions: int, dimension: int) -> np.ndarray: + centroids = _as_numpy(index.centers) + if centroids.shape != (num_partitions, dimension): + raise ValueError( + "cuVS returned incompatible IVF centroids shape: " + f"expected {(num_partitions, dimension)}, got {centroids.shape}" + ) + return centroids + + +def _normalize_pq_codebook( + index, num_sub_vectors: int, num_bits: int, dimension: int +) -> np.ndarray: + pq_book_size = 1 << num_bits + subvector_dim = dimension // num_sub_vectors + pq_centers = _as_numpy(index.pq_centers) + + expected_shapes = { + (num_sub_vectors, subvector_dim, pq_book_size): (0, 2, 1), + (num_sub_vectors, pq_book_size, subvector_dim): None, + } + transpose = expected_shapes.get(pq_centers.shape) + if transpose is None and pq_centers.shape not in expected_shapes: + raise ValueError( + "cuVS returned incompatible PQ codebook shape: expected one of " + f"{list(expected_shapes.keys())}, got {pq_centers.shape}" + ) + if transpose is not None: + pq_centers = np.transpose(pq_centers, transpose) + return pq_centers + + +def _estimate_trainset_fraction( + num_rows: int, num_partitions: int, sample_rate: int +) -> float: + if num_rows <= 0: + raise ValueError("cuVS training 
requires a non-empty dataset") + desired_rows = max(num_partitions * sample_rate, 256 * 256) + return min(1.0, desired_rows / num_rows) + + +def train_ivf_pq_on_cuvs( + dataset, + column: str, + num_partitions: int, + metric_type: str, + accelerator: str, + num_sub_vectors: int, + *, + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + filter_nan: bool = True, +) -> Tuple[np.ndarray, np.ndarray]: + if accelerator != "cuvs": + raise ValueError("cuVS acceleration only supports accelerator='cuvs'") + if num_bits != 8: + raise ValueError("cuVS IVF_PQ integration currently supports only num_bits=8") + + dimension = dataset.schema.field(column).type.list_size + if dimension % num_sub_vectors != 0: + raise ValueError( + "cuVS IVF_PQ integration requires vector dimension to be divisible by " + "num_sub_vectors" + ) + + if dataset.schema.field(column).nullable and filter_nan: + filt = f"{column} is not null" + else: + filt = None + + num_rows = dataset.count_rows(filter=filt) + if num_rows == 0: + raise ValueError("cuVS training requires at least one non-null training vector") + + train_rows = max(1, min(num_rows, max(num_partitions * sample_rate, 256 * 256))) + trainset = dataset.sample( + train_rows, + columns=[column], + filter=filt, + randomize_order=True, + ) + matrix = _column_to_numpy(trainset, column) + + ivf_pq = _require_cuvs() + build_params = ivf_pq.IndexParams( + n_lists=num_partitions, + metric=_metric_to_cuvs(metric_type), + kmeans_n_iters=max_iters, + kmeans_trainset_fraction=_estimate_trainset_fraction( + matrix.shape[0], num_partitions, sample_rate + ), + pq_bits=num_bits, + pq_dim=num_sub_vectors, + codebook_kind="subspace", + force_random_rotation=False, + add_data_on_build=False, + ) + + index = ivf_pq.build(build_params, matrix) + + centroids = _normalize_centroids(index, num_partitions, dimension) + pq_codebook = _normalize_pq_codebook(index, num_sub_vectors, num_bits, dimension) + return centroids, pq_codebook + + +def 
one_pass_train_ivf_pq_on_cuvs( + dataset, + column: str, + num_partitions: int, + metric_type: str, + accelerator: str, + num_sub_vectors: int, + *, + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + filter_nan: bool = True, +): + return train_ivf_pq_on_cuvs( + dataset, + column, + num_partitions, + metric_type, + accelerator, + num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, + ) + + +def prepare_global_ivf_pq_on_cuvs( + dataset, + column: str, + num_partitions: int, + num_sub_vectors: int, + *, + distance_type: str = "l2", + accelerator: str = "cuvs", + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, +): + centroids, pq_codebook = train_ivf_pq_on_cuvs( + dataset, + column, + num_partitions, + distance_type, + accelerator, + num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + ) + return {"ivf_centroids": centroids, "pq_codebook": pq_codebook} diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 7496746285a..a5e1681b250 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -39,6 +39,7 @@ from lance.log import LOGGER from .blob import BlobFile +from .cuvs import is_cuvs_accelerator from .dependencies import ( _check_for_numpy, _check_for_torch, @@ -2899,20 +2900,24 @@ def _create_index_impl( # Handle timing for various parts of accelerated builds timers = {} + use_cuvs = is_cuvs_accelerator(accelerator) if accelerator is not None and index_type != "IVF_PQ": + if use_cuvs: + raise ValueError( + f"accelerator='{accelerator}' only supports IVF_PQ index builds" + ) LOGGER.warning( "Index type %s does not support GPU acceleration; falling back to CPU", index_type, ) accelerator = None + use_cuvs = False # IMPORTANT: Distributed indexing is CPU-only. Enforce single-node when - # accelerator or torch-related paths are detected. 
- torch_detected = False + # any Python-side accelerator path is selected. + accelerated_build_detected = accelerator is not None try: - if accelerator is not None: - torch_detected = True - else: + if accelerator is None: impl = kwargs.get("implementation") use_torch_flag = kwargs.get("use_torch") is True one_pass_flag = kwargs.get("one_pass_ivfpq") is True @@ -2925,16 +2930,16 @@ def _create_index_impl( or torch_centroids or torch_codebook ): - torch_detected = True + accelerated_build_detected = True except Exception: # Be conservative: if detection fails, do not modify behavior pass - if torch_detected: + if accelerated_build_detected: if require_commit: if fragment_ids is not None or index_uuid is not None: LOGGER.info( - "Torch detected; " + "Accelerated build detected; " "enforce single-node indexing (distributed is CPU-only)." ) fragment_ids = None @@ -2942,63 +2947,92 @@ def _create_index_impl( else: if index_uuid is not None: LOGGER.info( - "Torch detected; " + "Accelerated build detected; " "enforce single-node indexing (distributed is CPU-only)." 
) index_uuid = None if accelerator is not None: - from .vector import ( - one_pass_assign_ivf_pq_on_accelerator, - one_pass_train_ivf_pq_on_accelerator, - ) - - LOGGER.info("Doing one-pass ivfpq accelerated computations") if num_partitions is None: num_rows = self.count_rows() num_partitions = _target_partition_size_to_num_partitions( num_rows, target_partition_size ) - timers["ivf+pq_train:start"] = time.time() - ( - ivf_centroids, - ivf_kmeans, - pq_codebook, - pq_kmeans_list, - ) = one_pass_train_ivf_pq_on_accelerator( - self, - column[0], - num_partitions, - metric, - accelerator, - num_sub_vectors=num_sub_vectors, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_train:end"] = time.time() - ivfpq_train_time = timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] - LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) - timers["ivf+pq_assign:start"] = time.time() - shuffle_output_dir, shuffle_buffers = one_pass_assign_ivf_pq_on_accelerator( - self, - column[0], - metric, - accelerator, - ivf_kmeans, - pq_kmeans_list, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_assign:end"] = time.time() - ivfpq_assign_time = ( - timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] - ) - LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) - kwargs["precomputed_shuffle_buffers"] = shuffle_buffers - kwargs["precomputed_shuffle_buffers_path"] = os.path.join( - shuffle_output_dir, "data" - ) + if use_cuvs: + from .cuvs import one_pass_train_ivf_pq_on_cuvs + + LOGGER.info("Doing one-pass ivfpq cuVS training") + timers["ivf+pq_train:start"] = time.time() + ivf_centroids, pq_codebook = one_pass_train_ivf_pq_on_cuvs( + self, + column[0], + num_partitions, + metric, + accelerator, + num_sub_vectors=num_sub_vectors, + sample_rate=kwargs.get("sample_rate", 256), + max_iters=kwargs.get("max_iters", 50), + num_bits=kwargs.get("num_bits", 8), + filter_nan=filter_nan, + ) + timers["ivf+pq_train:end"] = time.time() + 
ivfpq_train_time = ( + timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] + ) + LOGGER.info("cuVS ivf+pq training time: %ss", ivfpq_train_time) + else: + from .vector import ( + one_pass_assign_ivf_pq_on_accelerator, + one_pass_train_ivf_pq_on_accelerator, + ) + + LOGGER.info("Doing one-pass ivfpq accelerated computations") + timers["ivf+pq_train:start"] = time.time() + ( + ivf_centroids, + ivf_kmeans, + pq_codebook, + pq_kmeans_list, + ) = one_pass_train_ivf_pq_on_accelerator( + self, + column[0], + num_partitions, + metric, + accelerator, + num_sub_vectors=num_sub_vectors, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_train:end"] = time.time() + ivfpq_train_time = ( + timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] + ) + LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) + timers["ivf+pq_assign:start"] = time.time() + ( + shuffle_output_dir, + shuffle_buffers, + ) = one_pass_assign_ivf_pq_on_accelerator( + self, + column[0], + metric, + accelerator, + ivf_kmeans, + pq_kmeans_list, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_assign:end"] = time.time() + ivfpq_assign_time = ( + timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] + ) + LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) + + kwargs["precomputed_shuffle_buffers"] = shuffle_buffers + kwargs["precomputed_shuffle_buffers_path"] = os.path.join( + shuffle_output_dir, "data" + ) if index_type.startswith("IVF"): if (ivf_centroids is not None) and (ivf_centroids_file is not None): raise ValueError( diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py index c31ea0a7a0c..00591ead934 100644 --- a/python/python/lance/indices/builder.py +++ b/python/python/lance/indices/builder.py @@ -9,6 +9,7 @@ import numpy as np import pyarrow as pa +from lance.cuvs import is_cuvs_accelerator, prepare_global_ivf_pq_on_cuvs from lance.indices.ivf import IvfModel from lance.indices.pq import PqModel @@ 
-115,6 +116,11 @@ def train_ivf( self._verify_ivf_sample_rate(sample_rate, num_partitions, num_rows) distance_type = self._normalize_distance_type(distance_type) self._verify_ivf_params(num_partitions) + if is_cuvs_accelerator(accelerator): + raise NotImplementedError( + "IndicesBuilder.train_ivf does not support accelerator='cuvs'; " + "use prepare_global_ivf_pq instead" + ) if accelerator is None: from lance.lance import indices @@ -250,6 +256,25 @@ def prepare_global_ivf_pq( `IndicesBuilder.train_pq` (indices.train_pq_model). No public method names elsewhere are changed. """ + if is_cuvs_accelerator(accelerator): + if fragment_ids is not None: + raise NotImplementedError( + "fragment_ids is not supported with accelerator='cuvs'" + ) + num_rows = self._count_rows() + num_partitions = self._determine_num_partitions(num_partitions, num_rows) + num_subvectors = self._normalize_pq_params(num_subvectors, self.dimension) + return prepare_global_ivf_pq_on_cuvs( + self.dataset, + self.column[0], + num_partitions, + num_subvectors, + distance_type=distance_type, + accelerator=accelerator, + sample_rate=sample_rate, + max_iters=max_iters, + ) + # Global IVF training ivf_model = self.train_ivf( num_partitions, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index b20ffc8cf7a..54c6003c278 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -9,10 +9,12 @@ import string import tempfile import time +from importlib import import_module from pathlib import Path from typing import Optional import lance +import lance.cuvs as lance_cuvs import numpy as np import pyarrow as pa import pyarrow.compute as pc @@ -505,6 +507,15 @@ def test_create_index_unsupported_accelerator(tmp_path): accelerator="cuda:abc", ) + with pytest.raises(ValueError): + dataset.create_index( + "vector", + index_type="IVF_PQ", + num_partitions=4, + num_sub_vectors=16, + accelerator="cuvs:0", + ) + def 
test_create_index_accelerator_fallback(tmp_path, caplog): tbl = create_table() @@ -526,6 +537,124 @@ def test_create_index_accelerator_fallback(tmp_path, caplog): ) +def test_create_index_cuvs_dispatch(tmp_path, monkeypatch): + tbl = create_table(nvec=512, ndim=128) + dataset = lance.write_dataset(tbl, tmp_path) + calls = {} + + def fake_train( + dataset_arg, + column, + num_partitions, + metric_type, + accelerator, + num_sub_vectors, + *, + sample_rate, + max_iters, + num_bits, + filter_nan, + ): + calls["dataset"] = dataset_arg + calls["column"] = column + calls["num_partitions"] = num_partitions + calls["metric_type"] = metric_type + calls["accelerator"] = accelerator + calls["num_sub_vectors"] = num_sub_vectors + calls["sample_rate"] = sample_rate + calls["max_iters"] = max_iters + calls["num_bits"] = num_bits + calls["filter_nan"] = filter_nan + return ( + np.random.randn(num_partitions, 128).astype(np.float32), + np.random.randn(num_sub_vectors, 256, 128 // num_sub_vectors).astype( + np.float32 + ), + ) + + monkeypatch.setattr(lance_cuvs, "one_pass_train_ivf_pq_on_cuvs", fake_train) + + dataset = dataset.create_index( + "vector", + index_type="IVF_PQ", + num_partitions=4, + num_sub_vectors=16, + accelerator="cuvs", + ) + + assert calls["column"] == "vector" + assert calls["num_partitions"] == 4 + assert calls["metric_type"] == "L2" + assert calls["accelerator"] == "cuvs" + assert calls["num_sub_vectors"] == 16 + assert dataset.stats.index_stats("vector_idx")["index_type"] == "IVF_PQ" + + +def test_create_index_cuvs_rejects_non_ivf_pq(tmp_path): + tbl = create_table() + dataset = lance.write_dataset(tbl, tmp_path) + + with pytest.raises(ValueError, match="only supports IVF_PQ"): + dataset.create_index( + "vector", + index_type="IVF_FLAT", + num_partitions=4, + accelerator="cuvs", + ) + + +def test_prepare_global_ivf_pq_cuvs_dispatch(tmp_path, monkeypatch): + ds = _make_sample_dataset_base(tmp_path, "cuvs_prepare_ds", 512, 128) + builder = IndicesBuilder(ds, 
"vector") + builder_module = import_module("lance.indices.builder") + calls = {} + + def fake_prepare( + dataset_arg, + column, + num_partitions, + num_sub_vectors, + *, + distance_type, + accelerator, + sample_rate, + max_iters, + ): + calls["dataset"] = dataset_arg + calls["column"] = column + calls["num_partitions"] = num_partitions + calls["num_sub_vectors"] = num_sub_vectors + calls["distance_type"] = distance_type + calls["accelerator"] = accelerator + calls["sample_rate"] = sample_rate + calls["max_iters"] = max_iters + return { + "ivf_centroids": np.random.randn(num_partitions, 128).astype(np.float32), + "pq_codebook": np.random.randn( + num_sub_vectors, 256, 128 // num_sub_vectors + ).astype(np.float32), + } + + monkeypatch.setattr(builder_module, "prepare_global_ivf_pq_on_cuvs", fake_prepare) + + prepared = builder.prepare_global_ivf_pq( + num_partitions=4, + num_subvectors=16, + distance_type="l2", + accelerator="cuvs", + sample_rate=7, + max_iters=20, + ) + + assert calls["column"] == "vector" + assert calls["num_partitions"] == 4 + assert calls["num_sub_vectors"] == 16 + assert calls["distance_type"] == "l2" + assert calls["accelerator"] == "cuvs" + assert prepared["ivf_centroids"].shape == (4, 128) + assert prepared["pq_codebook"].shape == (16, 256, 8) + + def test_use_index(dataset, tmp_path): ann_ds = lance.write_dataset(dataset.to_table(), tmp_path / "indexed.lance") ann_ds = ann_ds.create_index( From 2f071f6a28fa399df7e19cd551adcb5dc663f5f3 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 02:05:54 +0800 Subject: [PATCH 02/21] python: document cuvs installation requirements --- python/DEVELOPMENT.md | 16 ++++++++++++++++ python/python/lance/cuvs.py | 4 +++- python/python/lance/dataset.py | 20 ++++++++++++++------ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/python/DEVELOPMENT.md b/python/DEVELOPMENT.md index 12c56549608..21dba0bdddd 100644 --- a/python/DEVELOPMENT.md +++ b/python/DEVELOPMENT.md @@ -8,6 +8,22 @@ uv sync 
--extra tests --extra dev Add extras such as `benchmarks`, `torch`, or `geo` only when you need them. After the environment is initialized, either activate it or use `uv run ...` for commands. +`accelerator="cuvs"` does not have a normal project extra today. cuVS Python +packages are published per CUDA major version and are typically installed from +NVIDIA's package index, for example: + +```shell +uv pip install --extra-index-url https://pypi.nvidia.com cuvs-cu12 +``` + +or: + +```shell +uv pip install --extra-index-url https://pypi.nvidia.com cuvs-cu13 +``` + +Pick the package that matches the CUDA version in your environment. + `uv sync` is not just downloading Python packages here. It also builds the local `pylance` Rust extension as part of the editable environment, so the first run, cache misses, or Rust dependency changes can make it noticeably slow. This is expected; let the build finish instead of interrupting it and switching to a different environment setup. ## Building the project diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index ab46eb8a432..6c0a4085c5c 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -20,7 +20,9 @@ def _require_cuvs(): return import_module("cuvs.neighbors.ivf_pq") except ModuleNotFoundError as exc: raise ModuleNotFoundError( - "accelerator='cuvs' requires the 'cuvs' package to be installed" + "accelerator='cuvs' requires cuVS Python bindings to be installed. " + "Install a CUDA-matched package such as 'cuvs-cu12' or 'cuvs-cu13' " + "from https://pypi.nvidia.com." ) from exc diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index a5e1681b250..cda142f5bb4 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3263,7 +3263,12 @@ def create_index( The number of sub-vectors for PQ (Product Quantization). accelerator : str or ``torch.Device``, optional If set, use an accelerator to speed up the training process. 
- Accepted accelerator: "cuda" (Nvidia GPU) and "mps" (Apple Silicon GPU). + Accepted accelerator: + + - "cuda" or ``torch.device(...)`` for the existing torch-based path + - "mps" for Apple Silicon GPU + - "cuvs" for the explicit cuVS-based IVF_PQ training path + If not set, use the CPU. index_cache_size : int, optional The size of the index cache in number of entries. Default value is 256. @@ -3372,8 +3377,10 @@ def create_index( Experimental Accelerator (GPU) support: - *accelerate*: use GPU to train IVF partitions. - Only supports CUDA (Nvidia) or MPS (Apple) currently. - Requires PyTorch being installed. + `accelerator="cuda"` and `accelerator="mps"` use the existing torch path. + `accelerator="cuvs"` uses cuVS for IVF_PQ training only. + The torch path requires PyTorch. The cuVS path requires the cuVS Python + bindings to be installed separately. .. code-block:: python @@ -3388,9 +3395,10 @@ def create_index( accelerator="cuda" ) - Note: GPU acceleration is currently supported only for the ``IVF_PQ`` index - type. Providing an accelerator for other index types will fall back to CPU - index building. + Note: accelerator support is currently limited to the ``IVF_PQ`` index type. + Providing ``accelerator="cuda"`` for other index types will fall back to CPU + index building. Providing ``accelerator="cuvs"`` for other index types will + raise an error. 
References ---------- From 1a6c44b7a1630d6ab8fe14aeea8dcce2d6c8bf99 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 14:17:05 +0800 Subject: [PATCH 03/21] python: fix cuvs training on real datasets --- python/python/lance/cuvs.py | 26 ++++++++++--- python/python/tests/test_vector_index.py | 48 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 6c0a4085c5c..0bfa910cb21 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -7,6 +7,7 @@ from typing import Tuple import pyarrow as pa +import pyarrow.compute as pc from .dependencies import numpy as np @@ -64,6 +65,10 @@ def _column_to_numpy(table: pa.Table, column: str) -> np.ndarray: def _as_numpy(array_like) -> np.ndarray: if isinstance(array_like, np.ndarray): return array_like + + if hasattr(array_like, "copy_to_host"): + return np.asarray(array_like.copy_to_host()) + try: array = np.asarray(array_like) if isinstance(array, np.ndarray): @@ -122,6 +127,20 @@ def _estimate_trainset_fraction( return min(1.0, desired_rows / num_rows) +def _sample_training_table(dataset, column: str, train_rows: int, filt: str | None) -> pa.Table: + if filt is None: + return dataset.sample(train_rows, columns=[column], randomize_order=True) + + total_rows = dataset.count_rows() + sample_rows = min(total_rows, max(train_rows * 2, train_rows + 1024)) + trainset = dataset.sample(sample_rows, columns=[column], randomize_order=True) + trainset = trainset.filter(pc.is_valid(trainset.column(column))) + if len(trainset) >= train_rows or sample_rows == total_rows: + return trainset.slice(0, min(train_rows, len(trainset))) + + return dataset.to_table(columns=[column], filter=filt, limit=train_rows) + + def train_ivf_pq_on_cuvs( dataset, column: str, @@ -157,12 +176,7 @@ def train_ivf_pq_on_cuvs( raise ValueError("cuVS training requires at least one non-null training vector") train_rows = max(1, min(num_rows, 
max(num_partitions * sample_rate, 256 * 256))) - trainset = dataset.sample( - train_rows, - columns=[column], - filter=filt, - randomize_order=True, - ) + trainset = _sample_training_table(dataset, column, train_rows, filt) matrix = _column_to_numpy(trainset, column) ivf_pq = _require_cuvs() diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 54c6003c278..b92952e5f3c 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -655,6 +655,54 @@ def fake_prepare( assert prepared["pq_codebook"].shape == (16, 256, 8) +def test_train_ivf_pq_on_cuvs_nullable_vectors(tmp_path, monkeypatch): + tbl = create_table(nvec=32, ndim=16, nullify=True) + dataset = lance.write_dataset(tbl, tmp_path) + + class FakeIndex: + centers = np.random.randn(4, 16).astype(np.float32) + pq_centers = np.random.randn(4, 256, 4).astype(np.float32) + + class FakeIvfPqModule: + class IndexParams: + def __init__(self, **kwargs): + self.kwargs = kwargs + + @staticmethod + def build(build_params, matrix): + assert build_params.kwargs["n_lists"] == 4 + assert matrix.shape[1] == 16 + assert matrix.dtype == np.float32 + return FakeIndex() + + monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) + + centroids, pq_codebook = lance_cuvs.train_ivf_pq_on_cuvs( + dataset, + "vector", + 4, + "L2", + "cuvs", + 4, + sample_rate=4, + ) + + assert centroids.shape == (4, 16) + assert pq_codebook.shape == (4, 256, 4) + + +def test_cuvs_as_numpy_prefers_copy_to_host(): + class FakeDeviceTensor: + def copy_to_host(self): + return np.arange(6, dtype=np.float32).reshape(2, 3) + + array = lance_cuvs._as_numpy(FakeDeviceTensor()) + + assert isinstance(array, np.ndarray) + assert array.shape == (2, 3) + assert array.dtype == np.float32 + + def test_use_index(dataset, tmp_path): ann_ds = lance.write_dataset(dataset.to_table(), tmp_path / "indexed.lance") ann_ds = ann_ds.create_index( From 
76995c0357d5f3e6c1dc55c4cfc26ac52a012dd7 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 17:03:47 +0800 Subject: [PATCH 04/21] python: format cuvs helper --- python/python/lance/cuvs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 0bfa910cb21..5c58af9552b 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -127,7 +127,9 @@ def _estimate_trainset_fraction( return min(1.0, desired_rows / num_rows) -def _sample_training_table(dataset, column: str, train_rows: int, filt: str | None) -> pa.Table: +def _sample_training_table( + dataset, column: str, train_rows: int, filt: str | None +) -> pa.Table: if filt is None: return dataset.sample(train_rows, columns=[column], randomize_order=True) From fbe0f50faf39f3c92365b534697f00dca0c4fbe6 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 17:12:33 +0800 Subject: [PATCH 05/21] python: clarify accelerator hardware requirements --- python/python/lance/dataset.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index cda142f5bb4..baa9890daf7 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3266,8 +3266,13 @@ def create_index( Accepted accelerator: - "cuda" or ``torch.device(...)`` for the existing torch-based path + on NVIDIA GPUs - "mps" for Apple Silicon GPU - - "cuvs" for the explicit cuVS-based IVF_PQ training path + - "cuvs" for the explicit cuVS-based IVF_PQ training path on NVIDIA + GPUs + + The cuVS path also requires the cuVS Python bindings to be installed + separately. If not set, use the CPU. index_cache_size : int, optional @@ -3377,10 +3382,13 @@ def create_index( Experimental Accelerator (GPU) support: - *accelerate*: use GPU to train IVF partitions. - `accelerator="cuda"` and `accelerator="mps"` use the existing torch path. 
- `accelerator="cuvs"` uses cuVS for IVF_PQ training only. - The torch path requires PyTorch. The cuVS path requires the cuVS Python - bindings to be installed separately. + `accelerator="cuda"` and `accelerator="mps"` use the existing torch + path. `accelerator="cuda"` runs on NVIDIA GPUs and `accelerator="mps"` + runs on Apple Silicon GPUs. `accelerator="cuvs"` uses cuVS for IVF_PQ + training only and requires an NVIDIA GPU. + + The torch path requires PyTorch. The cuVS path requires the cuVS + Python bindings to be installed separately. .. code-block:: python From f00d0783e221b27e05a8a4331f2af4d76ea93c2a Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 19:15:25 +0800 Subject: [PATCH 06/21] python: add cuvs one-pass ivfpq assignment --- python/python/lance/cuvs.py | 248 ++++++++++++++++++++++- python/python/lance/dataset.py | 28 ++- python/python/tests/test_vector_index.py | 70 +++++++ 3 files changed, 337 insertions(+), 9 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 5c58af9552b..45142b6b774 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -3,13 +3,20 @@ from __future__ import annotations +import re +import tempfile from importlib import import_module -from typing import Tuple +from typing import TYPE_CHECKING, Iterator, Tuple import pyarrow as pa import pyarrow.compute as pc from .dependencies import numpy as np +from .log import LOGGER +from .util import _normalize_metric_type + +if TYPE_CHECKING: + from pathlib import Path def is_cuvs_accelerator(accelerator: object) -> bool: @@ -34,8 +41,33 @@ def _optional_cupy(): return None +def _xp_module(): + cupy = _optional_cupy() + return cupy if cupy is not None else np + + +def _make_progress(total: int): + try: + from tqdm.auto import tqdm + + return tqdm(total=total) + except ModuleNotFoundError: + + class _NoOpProgress: + def set_description(self, _description: str): + return None + + def update(self, _count: int): + return None 
+ + def close(self): + return None + + return _NoOpProgress() + + def _metric_to_cuvs(metric_type: str) -> str: - metric_type = metric_type.lower() + metric_type = _normalize_metric_type(metric_type).lower() if metric_type in {"l2", "euclidean"}: return "sqeuclidean" if metric_type == "dot": @@ -45,12 +77,7 @@ def _metric_to_cuvs(metric_type: str) -> str: raise ValueError(f"Metric '{metric_type}' is not supported by cuVS IVF_PQ") -def _column_to_numpy(table: pa.Table, column: str) -> np.ndarray: - array = table.column(column).combine_chunks() - values = array.to_pylist() - if len(values) == 0: - raise ValueError("cuVS training requires at least one training vector") - matrix = np.asarray(values) +def _coerce_float_matrix(matrix: np.ndarray, *, column: str) -> np.ndarray: if matrix.ndim != 2: raise ValueError( f"Expected a 2D training matrix for column '{column}', got {matrix.shape}" @@ -62,6 +89,22 @@ def _column_to_numpy(table: pa.Table, column: str) -> np.ndarray: return matrix +def _column_to_numpy(table: pa.Table | pa.RecordBatch, column: str) -> np.ndarray: + array = table.column(column) + if isinstance(array, pa.ChunkedArray): + array = array.combine_chunks() + if len(array) == 0: + raise ValueError("cuVS training requires at least one training vector") + + if pa.types.is_fixed_size_list(array.type): + values = array.values.to_numpy(zero_copy_only=False) + matrix = values.reshape(len(array), array.type.list_size) + return _coerce_float_matrix(matrix, column=column) + + values = array.to_pylist() + return _coerce_float_matrix(np.asarray(values), column=column) + + def _as_numpy(array_like) -> np.ndarray: if isinstance(array_like, np.ndarray): return array_like @@ -143,6 +186,195 @@ def _sample_training_table( return dataset.to_table(columns=[column], filter=filt, limit=train_rows) +def _normalize_metric(metric_type: str) -> str: + return _normalize_metric_type(metric_type).lower() + + +def _backend_asarray(array_like, xp): + if xp is np: + return 
np.asarray(array_like) + return xp.asarray(array_like) + + +def _backend_to_numpy(array_like, xp) -> np.ndarray: + if xp is np: + return np.asarray(array_like) + return xp.asnumpy(array_like) + + +def _normalize_rows(matrix, xp): + eps = xp.finfo(matrix.dtype).eps + norms = xp.linalg.norm(matrix, axis=1, keepdims=True) + return matrix / xp.maximum(norms, eps) + + +def _argmin_distance(vectors, centroids, metric_type: str, xp): + if vectors.shape[0] == 0: + return xp.empty((0,), dtype=xp.int32) + + metric_type = _normalize_metric(metric_type) + if metric_type in {"l2", "euclidean"}: + vec_norms = xp.sum(vectors * vectors, axis=1, keepdims=True) + ctr_norms = xp.sum(centroids * centroids, axis=1, keepdims=False) + distances = vec_norms + ctr_norms - 2 * vectors @ centroids.T + return xp.argmin(distances, axis=1).astype(xp.int32, copy=False) + + if metric_type == "dot": + scores = vectors @ centroids.T + return xp.argmax(scores, axis=1).astype(xp.int32, copy=False) + + if metric_type == "cosine": + scores = _normalize_rows(vectors, xp) @ _normalize_rows(centroids, xp).T + return xp.argmax(scores, axis=1).astype(xp.int32, copy=False) + + raise ValueError(f"Metric '{metric_type}' is not supported by cuVS IVF_PQ") + + +def _encode_pq_codes(residuals, pq_codebook, metric_type: str, xp) -> np.ndarray: + num_rows, num_sub_vectors, _ = residuals.shape + codes = np.empty((num_rows, num_sub_vectors), dtype=np.uint8) + for subvector_idx in range(num_sub_vectors): + sub_vectors = residuals[:, subvector_idx, :] + sub_codebook = pq_codebook[subvector_idx] + nearest = _argmin_distance(sub_vectors, sub_codebook, metric_type, xp) + codes[:, subvector_idx] = _backend_to_numpy(nearest, xp).astype( + np.uint8, copy=False + ) + return codes + + +def _make_shuffle_batch( + row_ids: np.ndarray, + partitions: np.ndarray, + pq_codes: np.ndarray, + num_sub_vectors: int, +) -> pa.RecordBatch: + pq_values = pa.array(pq_codes.reshape(-1)) + pq_code_array = 
pa.FixedSizeListArray.from_arrays(pq_values, num_sub_vectors) + return pa.RecordBatch.from_arrays( + [ + pa.array(row_ids, type=pa.uint64()), + pa.array(partitions, type=pa.uint32()), + pq_code_array, + ], + schema=pa.schema( + [ + pa.field("row_id", pa.uint64()), + pa.field("__ivf_part_id", pa.uint32()), + pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), + ] + ), + ) + + +def one_pass_assign_ivf_pq_on_cuvs( + dataset, + column: str, + metric_type: str, + accelerator: str, + ivf_centroids: np.ndarray, + pq_codebook: np.ndarray, + dst_dataset_uri: str | Path | None = None, + batch_size: int = 1024 * 10 * 4, + *, + filter_nan: bool = True, +): + if accelerator != "cuvs": + raise ValueError("cuVS acceleration only supports accelerator='cuvs'") + + num_rows = dataset.count_rows() + if dataset.schema.field(column).nullable and filter_nan: + filt = f"{column} is not null" + else: + filt = None + + num_sub_vectors = pq_codebook.shape[0] + subvector_size = pq_codebook.shape[2] + dim = ivf_centroids.shape[1] + if dim != num_sub_vectors * subvector_size: + raise ValueError( + "cuVS returned incompatible IVF/PQ dimensions: " + f"centroids dim {dim} != {num_sub_vectors} * {subvector_size}" + ) + + xp = _xp_module() + backend_centroids = _backend_asarray(ivf_centroids, xp) + backend_codebook = _backend_asarray(pq_codebook, xp) + + progress = _make_progress(num_rows) + progress.set_description("Assigning partitions and computing pq codes") + + def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: + for batch in dataset.to_batches( + columns=[column], + filter=filt, + with_row_id=True, + batch_size=batch_size, + ): + vectors = _column_to_numpy(batch, column) + row_ids = batch.column("_rowid").to_numpy() + valid_mask = np.isfinite(vectors).all(axis=1) + if not np.all(valid_mask): + LOGGER.warning( + "%s vectors are ignored during partition assignment", + len(valid_mask) - int(valid_mask.sum()), + ) + row_ids = row_ids[valid_mask] + vectors = 
vectors[valid_mask] + if len(row_ids) == 0: + continue + backend_vectors = _backend_asarray(vectors, xp) + + partitions = _argmin_distance( + backend_vectors, backend_centroids, metric_type, xp + ) + selected_centroids = backend_centroids[partitions] + residuals = backend_vectors - selected_centroids + residuals = residuals.reshape(-1, num_sub_vectors, subvector_size) + pq_codes = _encode_pq_codes(residuals, backend_codebook, metric_type, xp) + + partition_batch = _make_shuffle_batch( + row_ids, + _backend_to_numpy(partitions, xp), + pq_codes, + num_sub_vectors, + ) + progress.update(partition_batch.num_rows) + yield partition_batch + + output_schema = pa.schema( + [ + pa.field("row_id", pa.uint64()), + pa.field("__ivf_part_id", pa.uint32()), + pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), + ] + ) + rbr = pa.RecordBatchReader.from_batches( + output_schema, _partition_and_pq_codes_assignment() + ) + if dst_dataset_uri is None: + dst_dataset_uri = tempfile.mkdtemp() + if re.search(r".:\\", dst_dataset_uri) is not None: + dst_dataset_uri = dst_dataset_uri.replace("\\", "/", 1) + + from . 
import write_dataset + + ds = write_dataset( + rbr, + dst_dataset_uri, + schema=output_schema, + data_storage_version="legacy", + ) + + progress.close() + LOGGER.info("Saved precomputed pq_codes to %s", dst_dataset_uri) + + shuffle_buffers = [ + data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() + ] + return str(dst_dataset_uri), shuffle_buffers + + def train_ivf_pq_on_cuvs( dataset, column: str, diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index baa9890daf7..485f4425694 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2960,7 +2960,10 @@ def _create_index_impl( ) if use_cuvs: - from .cuvs import one_pass_train_ivf_pq_on_cuvs + from .cuvs import ( + one_pass_assign_ivf_pq_on_cuvs, + one_pass_train_ivf_pq_on_cuvs, + ) LOGGER.info("Doing one-pass ivfpq cuVS training") timers["ivf+pq_train:start"] = time.time() @@ -2981,6 +2984,29 @@ def _create_index_impl( timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] ) LOGGER.info("cuVS ivf+pq training time: %ss", ivfpq_train_time) + timers["ivf+pq_assign:start"] = time.time() + ( + shuffle_output_dir, + shuffle_buffers, + ) = one_pass_assign_ivf_pq_on_cuvs( + self, + column[0], + metric, + accelerator, + ivf_centroids, + pq_codebook, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_assign:end"] = time.time() + ivfpq_assign_time = ( + timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] + ) + LOGGER.info("cuVS ivf+pq transform time: %ss", ivfpq_assign_time) + kwargs["precomputed_shuffle_buffers"] = shuffle_buffers + kwargs["precomputed_shuffle_buffers_path"] = os.path.join( + shuffle_output_dir, "data" + ) else: from .vector import ( one_pass_assign_ivf_pq_on_accelerator, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index b92952e5f3c..7b49137e7a1 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ 
-572,7 +572,48 @@ def fake_train( ), ) + def fake_assign( + dataset_arg, + column, + metric_type, + accelerator, + ivf_centroids, + pq_codebook, + dst_dataset_uri=None, + batch_size=20480, + *, + filter_nan, + ): + calls["assign_dataset"] = dataset_arg + calls["assign_column"] = column + calls["assign_metric_type"] = metric_type + calls["assign_accelerator"] = accelerator + calls["assign_batch_size"] = batch_size + calls["assign_filter_nan"] = filter_nan + + row_ids = dataset_arg.to_table(columns=[], with_row_id=True)[ + "_rowid" + ].to_numpy() + part_ids = pa.array(np.zeros(len(row_ids), dtype=np.uint32)) + pq_values = pa.array(np.zeros(len(row_ids) * 16, dtype=np.uint8)) + pq_codes = pa.FixedSizeListArray.from_arrays(pq_values, 16) + shuffle_ds_uri = str(tmp_path / "cuvs_shuffle_buffers") + shuffle_ds = lance.write_dataset( + pa.Table.from_arrays( + [pa.array(row_ids), part_ids, pq_codes], + names=["row_id", "__ivf_part_id", "__pq_code"], + ), + shuffle_ds_uri, + ) + shuffle_buffers = [ + data_file.path + for frag in shuffle_ds.get_fragments() + for data_file in frag.data_files() + ] + return shuffle_ds_uri, shuffle_buffers + monkeypatch.setattr(lance_cuvs, "one_pass_train_ivf_pq_on_cuvs", fake_train) + monkeypatch.setattr(lance_cuvs, "one_pass_assign_ivf_pq_on_cuvs", fake_assign) dataset = dataset.create_index( "vector", @@ -587,6 +628,9 @@ def fake_train( assert calls["metric_type"] == "L2" assert calls["accelerator"] == "cuvs" assert calls["num_sub_vectors"] == 16 + assert calls["assign_column"] == "vector" + assert calls["assign_metric_type"] == "L2" + assert calls["assign_accelerator"] == "cuvs" assert dataset.stats.index_stats("vector_idx")["index_type"] == "IVF_PQ" @@ -703,6 +747,32 @@ def copy_to_host(self): assert array.dtype == np.float32 +def test_one_pass_assign_ivf_pq_on_cuvs_writes_shuffle_buffers(tmp_path): + tbl = create_table(nvec=32, ndim=16) + dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") + + ivf_centroids = 
np.random.randn(4, 16).astype(np.float32) + pq_codebook = np.random.randn(4, 256, 4).astype(np.float32) + + shuffle_uri, shuffle_buffers = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( + dataset, + "vector", + "l2", + "cuvs", + ivf_centroids, + pq_codebook, + batch_size=8, + ) + + shuffle_ds = lance.dataset(shuffle_uri) + batch = next(shuffle_ds.to_batches()) + + assert len(shuffle_buffers) > 0 + assert batch.column("row_id").type == pa.uint64() + assert batch.column("__ivf_part_id").type == pa.uint32() + assert batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) + + def test_use_index(dataset, tmp_path): ann_ds = lance.write_dataset(dataset.to_table(), tmp_path / "indexed.lance") ann_ds = ann_ds.create_index( From 7f7e6e2e3829b45c9324ae7014c314489879eaab Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 1 Apr 2026 23:31:24 +0800 Subject: [PATCH 07/21] python: use cuvs transform for full ivf pq build --- python/python/lance/cuvs.py | 219 ++++++++++------------- python/python/lance/dataset.py | 5 +- python/python/tests/test_vector_index.py | 45 ++++- 3 files changed, 142 insertions(+), 127 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 45142b6b774..8498be7720b 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -41,11 +41,6 @@ def _optional_cupy(): return None -def _xp_module(): - cupy = _optional_cupy() - return cupy if cupy is not None else np - - def _make_progress(total: int): try: from tqdm.auto import tqdm @@ -129,6 +124,16 @@ def _as_numpy(array_like) -> np.ndarray: raise TypeError("Unable to convert cuVS output to numpy") +def _to_cuvs_transform_input(matrix: np.ndarray): + cupy = _optional_cupy() + if cupy is None: + raise ModuleNotFoundError( + "accelerator='cuvs' full index build requires the 'cupy' package " + "to pass transform batches in device memory" + ) + return cupy.asarray(matrix) + + def _normalize_centroids(index, num_partitions: int, dimension: int) -> np.ndarray: centroids 
= _as_numpy(index.centers) if centroids.shape != (num_partitions, dimension): @@ -186,63 +191,6 @@ def _sample_training_table( return dataset.to_table(columns=[column], filter=filt, limit=train_rows) -def _normalize_metric(metric_type: str) -> str: - return _normalize_metric_type(metric_type).lower() - - -def _backend_asarray(array_like, xp): - if xp is np: - return np.asarray(array_like) - return xp.asarray(array_like) - - -def _backend_to_numpy(array_like, xp) -> np.ndarray: - if xp is np: - return np.asarray(array_like) - return xp.asnumpy(array_like) - - -def _normalize_rows(matrix, xp): - eps = xp.finfo(matrix.dtype).eps - norms = xp.linalg.norm(matrix, axis=1, keepdims=True) - return matrix / xp.maximum(norms, eps) - - -def _argmin_distance(vectors, centroids, metric_type: str, xp): - if vectors.shape[0] == 0: - return xp.empty((0,), dtype=xp.int32) - - metric_type = _normalize_metric(metric_type) - if metric_type in {"l2", "euclidean"}: - vec_norms = xp.sum(vectors * vectors, axis=1, keepdims=True) - ctr_norms = xp.sum(centroids * centroids, axis=1, keepdims=False) - distances = vec_norms + ctr_norms - 2 * vectors @ centroids.T - return xp.argmin(distances, axis=1).astype(xp.int32, copy=False) - - if metric_type == "dot": - scores = vectors @ centroids.T - return xp.argmax(scores, axis=1).astype(xp.int32, copy=False) - - if metric_type == "cosine": - scores = _normalize_rows(vectors, xp) @ _normalize_rows(centroids, xp).T - return xp.argmax(scores, axis=1).astype(xp.int32, copy=False) - - raise ValueError(f"Metric '{metric_type}' is not supported by cuVS IVF_PQ") - - -def _encode_pq_codes(residuals, pq_codebook, metric_type: str, xp) -> np.ndarray: - num_rows, num_sub_vectors, _ = residuals.shape - codes = np.empty((num_rows, num_sub_vectors), dtype=np.uint8) - for subvector_idx in range(num_sub_vectors): - sub_vectors = residuals[:, subvector_idx, :] - sub_codebook = pq_codebook[subvector_idx] - nearest = _argmin_distance(sub_vectors, sub_codebook, 
metric_type, xp) - codes[:, subvector_idx] = _backend_to_numpy(nearest, xp).astype( - np.uint8, copy=False - ) - return codes - - def _make_shuffle_batch( row_ids: np.ndarray, partitions: np.ndarray, @@ -267,6 +215,65 @@ def _make_shuffle_batch( ) +def _train_ivf_pq_index_on_cuvs( + dataset, + column: str, + num_partitions: int, + metric_type: str, + accelerator: str, + num_sub_vectors: int, + *, + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + filter_nan: bool = True, +): + if accelerator != "cuvs": + raise ValueError("cuVS acceleration only supports accelerator='cuvs'") + if num_bits != 8: + raise ValueError("cuVS IVF_PQ integration currently supports only num_bits=8") + + dimension = dataset.schema.field(column).type.list_size + if dimension % num_sub_vectors != 0: + raise ValueError( + "cuVS IVF_PQ integration requires vector dimension to be divisible by " + "num_sub_vectors" + ) + + if dataset.schema.field(column).nullable and filter_nan: + filt = f"{column} is not null" + else: + filt = None + + num_rows = dataset.count_rows(filter=filt) + if num_rows == 0: + raise ValueError("cuVS training requires at least one non-null training vector") + + train_rows = max(1, min(num_rows, max(num_partitions * sample_rate, 256 * 256))) + trainset = _sample_training_table(dataset, column, train_rows, filt) + matrix = _column_to_numpy(trainset, column) + + ivf_pq = _require_cuvs() + build_params = ivf_pq.IndexParams( + n_lists=num_partitions, + metric=_metric_to_cuvs(metric_type), + kmeans_n_iters=max_iters, + kmeans_trainset_fraction=_estimate_trainset_fraction( + matrix.shape[0], num_partitions, sample_rate + ), + pq_bits=num_bits, + pq_dim=num_sub_vectors, + codebook_kind="subspace", + force_random_rotation=False, + add_data_on_build=False, + ) + + index = ivf_pq.build(build_params, matrix) + centroids = _normalize_centroids(index, num_partitions, dimension) + pq_codebook = _normalize_pq_codebook(index, num_sub_vectors, num_bits, dimension) + return 
index, centroids, pq_codebook + + def one_pass_assign_ivf_pq_on_cuvs( dataset, column: str, @@ -274,6 +281,7 @@ def one_pass_assign_ivf_pq_on_cuvs( accelerator: str, ivf_centroids: np.ndarray, pq_codebook: np.ndarray, + trained_index=None, dst_dataset_uri: str | Path | None = None, batch_size: int = 1024 * 10 * 4, *, @@ -289,18 +297,14 @@ def one_pass_assign_ivf_pq_on_cuvs( filt = None num_sub_vectors = pq_codebook.shape[0] - subvector_size = pq_codebook.shape[2] - dim = ivf_centroids.shape[1] - if dim != num_sub_vectors * subvector_size: + ivf_pq = _require_cuvs() + + if trained_index is None: raise ValueError( - "cuVS returned incompatible IVF/PQ dimensions: " - f"centroids dim {dim} != {num_sub_vectors} * {subvector_size}" + "one_pass_assign_ivf_pq_on_cuvs requires a trained cuVS index for " + "single-node transform" ) - xp = _xp_module() - backend_centroids = _backend_asarray(ivf_centroids, xp) - backend_codebook = _backend_asarray(pq_codebook, xp) - progress = _make_progress(num_rows) progress.set_description("Assigning partitions and computing pq codes") @@ -323,19 +327,20 @@ def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: vectors = vectors[valid_mask] if len(row_ids) == 0: continue - backend_vectors = _backend_asarray(vectors, xp) - - partitions = _argmin_distance( - backend_vectors, backend_centroids, metric_type, xp + partitions, pq_codes = ivf_pq.transform( + trained_index, _to_cuvs_transform_input(vectors) ) - selected_centroids = backend_centroids[partitions] - residuals = backend_vectors - selected_centroids - residuals = residuals.reshape(-1, num_sub_vectors, subvector_size) - pq_codes = _encode_pq_codes(residuals, backend_codebook, metric_type, xp) + partitions = _as_numpy(partitions).astype(np.uint32, copy=False) + pq_codes = _as_numpy(pq_codes).astype(np.uint8, copy=False) + if pq_codes.shape != (len(row_ids), num_sub_vectors): + raise ValueError( + "cuVS transform returned incompatible PQ codes shape: " + f"expected 
{(len(row_ids), num_sub_vectors)}, got {pq_codes.shape}" + ) partition_batch = _make_shuffle_batch( row_ids, - _backend_to_numpy(partitions, xp), + partitions, pq_codes, num_sub_vectors, ) @@ -388,50 +393,18 @@ def train_ivf_pq_on_cuvs( num_bits: int = 8, filter_nan: bool = True, ) -> Tuple[np.ndarray, np.ndarray]: - if accelerator != "cuvs": - raise ValueError("cuVS acceleration only supports accelerator='cuvs'") - if num_bits != 8: - raise ValueError("cuVS IVF_PQ integration currently supports only num_bits=8") - - dimension = dataset.schema.field(column).type.list_size - if dimension % num_sub_vectors != 0: - raise ValueError( - "cuVS IVF_PQ integration requires vector dimension to be divisible by " - "num_sub_vectors" - ) - - if dataset.schema.field(column).nullable and filter_nan: - filt = f"{column} is not null" - else: - filt = None - - num_rows = dataset.count_rows(filter=filt) - if num_rows == 0: - raise ValueError("cuVS training requires at least one non-null training vector") - - train_rows = max(1, min(num_rows, max(num_partitions * sample_rate, 256 * 256))) - trainset = _sample_training_table(dataset, column, train_rows, filt) - matrix = _column_to_numpy(trainset, column) - - ivf_pq = _require_cuvs() - build_params = ivf_pq.IndexParams( - n_lists=num_partitions, - metric=_metric_to_cuvs(metric_type), - kmeans_n_iters=max_iters, - kmeans_trainset_fraction=_estimate_trainset_fraction( - matrix.shape[0], num_partitions, sample_rate - ), - pq_bits=num_bits, - pq_dim=num_sub_vectors, - codebook_kind="subspace", - force_random_rotation=False, - add_data_on_build=False, + _, centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( + dataset, + column, + num_partitions, + metric_type, + accelerator, + num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, ) - - index = ivf_pq.build(build_params, matrix) - - centroids = _normalize_centroids(index, num_partitions, dimension) - pq_codebook = 
_normalize_pq_codebook(index, num_sub_vectors, num_bits, dimension) return centroids, pq_codebook diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 485f4425694..1b24962a700 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2961,13 +2961,13 @@ def _create_index_impl( if use_cuvs: from .cuvs import ( + _train_ivf_pq_index_on_cuvs, one_pass_assign_ivf_pq_on_cuvs, - one_pass_train_ivf_pq_on_cuvs, ) LOGGER.info("Doing one-pass ivfpq cuVS training") timers["ivf+pq_train:start"] = time.time() - ivf_centroids, pq_codebook = one_pass_train_ivf_pq_on_cuvs( + trained_index, ivf_centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( self, column[0], num_partitions, @@ -2995,6 +2995,7 @@ def _create_index_impl( accelerator, ivf_centroids, pq_codebook, + trained_index=trained_index, batch_size=20480, filter_nan=filter_nan, ) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 7b49137e7a1..a54475e1ae0 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -542,6 +542,10 @@ def test_create_index_cuvs_dispatch(tmp_path, monkeypatch): dataset = lance.write_dataset(tbl, tmp_path) calls = {} + class FakeIndex: + pq_dim = 16 + pq_bits = 8 + def fake_train( dataset_arg, column, @@ -566,6 +570,7 @@ def fake_train( calls["num_bits"] = num_bits calls["filter_nan"] = filter_nan return ( + FakeIndex(), np.random.randn(num_partitions, 128).astype(np.float32), np.random.randn(num_sub_vectors, 256, 128 // num_sub_vectors).astype( np.float32 @@ -579,6 +584,7 @@ def fake_assign( accelerator, ivf_centroids, pq_codebook, + trained_index=None, dst_dataset_uri=None, batch_size=20480, *, @@ -588,6 +594,7 @@ def fake_assign( calls["assign_column"] = column calls["assign_metric_type"] = metric_type calls["assign_accelerator"] = accelerator + calls["assign_trained_index"] = trained_index calls["assign_batch_size"] = batch_size 
calls["assign_filter_nan"] = filter_nan @@ -612,7 +619,7 @@ def fake_assign( ] return shuffle_ds_uri, shuffle_buffers - monkeypatch.setattr(lance_cuvs, "one_pass_train_ivf_pq_on_cuvs", fake_train) + monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_index_on_cuvs", fake_train) monkeypatch.setattr(lance_cuvs, "one_pass_assign_ivf_pq_on_cuvs", fake_assign) dataset = dataset.create_index( @@ -631,6 +638,7 @@ def fake_assign( assert calls["assign_column"] == "vector" assert calls["assign_metric_type"] == "L2" assert calls["assign_accelerator"] == "cuvs" + assert isinstance(calls["assign_trained_index"], FakeIndex) assert dataset.stats.index_stats("vector_idx")["index_type"] == "IVF_PQ" @@ -747,13 +755,45 @@ def copy_to_host(self): assert array.dtype == np.float32 -def test_one_pass_assign_ivf_pq_on_cuvs_writes_shuffle_buffers(tmp_path): +def test_one_pass_assign_ivf_pq_on_cuvs_writes_shuffle_buffers(tmp_path, monkeypatch): tbl = create_table(nvec=32, ndim=16) dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") ivf_centroids = np.random.randn(4, 16).astype(np.float32) pq_codebook = np.random.randn(4, 256, 4).astype(np.float32) + class FakeDeviceTensor: + def __init__(self, array): + self._array = array + + def copy_to_host(self): + return self._array + + class FakeCupyArray: + def __init__(self, array): + self.array = array + + class FakeCupyModule: + @staticmethod + def asarray(array): + return FakeCupyArray(array) + + class FakeIndex: + pq_dim = 4 + pq_bits = 8 + + class FakeIvfPqModule: + @staticmethod + def transform(index, vectors): + assert isinstance(index, FakeIndex) + assert isinstance(vectors, FakeCupyArray) + labels = np.arange(len(vectors.array), dtype=np.uint32) % 4 + pq_codes = np.full((len(vectors.array), 4), 7, dtype=np.uint8) + return FakeDeviceTensor(labels), FakeDeviceTensor(pq_codes) + + monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) + monkeypatch.setattr(lance_cuvs, "_optional_cupy", lambda: FakeCupyModule()) + 
shuffle_uri, shuffle_buffers = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( dataset, "vector", @@ -761,6 +801,7 @@ def test_one_pass_assign_ivf_pq_on_cuvs_writes_shuffle_buffers(tmp_path): "cuvs", ivf_centroids, pq_codebook, + trained_index=FakeIndex(), batch_size=8, ) From f9c5d03d8f34e680bb54f03a626d219bcde62e8f Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 2 Apr 2026 17:04:49 +0800 Subject: [PATCH 08/21] python: route cuvs precomputed shuffle to v3 files --- python/python/lance/cuvs.py | 135 ++++++++++---- python/python/lance/dataset.py | 6 +- python/python/tests/test_vector_index.py | 16 +- rust/lance-index/src/vector/v3/shuffler.rs | 203 ++++++++++++++++++++- rust/lance/src/index/vector/builder.rs | 70 ++++++- 5 files changed, 376 insertions(+), 54 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 8498be7720b..ed8cad83907 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -3,6 +3,8 @@ from __future__ import annotations +import json +import os import re import tempfile from importlib import import_module @@ -12,6 +14,7 @@ import pyarrow.compute as pc from .dependencies import numpy as np +from .file import LanceFileWriter from .log import LOGGER from .util import _normalize_metric_type @@ -195,24 +198,108 @@ def _make_shuffle_batch( row_ids: np.ndarray, partitions: np.ndarray, pq_codes: np.ndarray, + num_partitions: int, num_sub_vectors: int, -) -> pa.RecordBatch: - pq_values = pa.array(pq_codes.reshape(-1)) +) -> tuple[pa.RecordBatch, pa.RecordBatch]: + sort_indices = np.argsort(partitions, kind="stable") + row_ids = row_ids[sort_indices] + partitions = partitions[sort_indices] + pq_codes = pq_codes[sort_indices] + + pq_values = pa.array(pq_codes.reshape(-1), type=pa.uint8()) pq_code_array = pa.FixedSizeListArray.from_arrays(pq_values, num_sub_vectors) - return pa.RecordBatch.from_arrays( + partition_counts = np.bincount(partitions, minlength=num_partitions).astype( + np.uint64, copy=False + ) 
+ offsets = np.cumsum(partition_counts, dtype=np.uint64) + data_batch = pa.RecordBatch.from_arrays( [ pa.array(row_ids, type=pa.uint64()), - pa.array(partitions, type=pa.uint32()), pq_code_array, ], schema=pa.schema( [ - pa.field("row_id", pa.uint64()), - pa.field("__ivf_part_id", pa.uint32()), + pa.field("_rowid", pa.uint64()), pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), ] ), ) + offsets_batch = pa.RecordBatch.from_arrays( + [pa.array(offsets, type=pa.uint64())], + schema=pa.schema([pa.field("offset", pa.uint64())]), + ) + return data_batch, offsets_batch + + +def _shuffle_metadata( + num_partitions: int, num_batches: int, partition_counts +) -> dict[str, str]: + return { + "lance:shuffle:num_partitions": str(num_partitions), + "lance:shuffle:num_batches": str(num_batches), + "lance:shuffle:partition_counts": json.dumps(list(partition_counts)), + "lance:shuffle:total_loss": "0.0", + } + + +def _write_v3_shuffle_files( + output_root: str, + batches: Iterator[tuple[pa.RecordBatch, pa.RecordBatch]], + *, + num_partitions: int, + num_sub_vectors: int, +) -> list[str]: + os.makedirs(output_root, exist_ok=True) + data_path = os.path.join(output_root, "shuffle_data.lance") + offsets_path = os.path.join(output_root, "shuffle_offsets.lance") + + data_schema = pa.schema( + [ + pa.field("_rowid", pa.uint64()), + pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), + ] + ) + offsets_schema = pa.schema([pa.field("offset", pa.uint64())]) + + data_writer = None + offsets_writer = LanceFileWriter(offsets_path, offsets_schema) + total_partition_counts = np.zeros(num_partitions, dtype=np.uint64) + global_row_count = np.uint64(0) + num_batches = 0 + + for data_batch, offsets_batch in batches: + if data_writer is None: + data_writer = LanceFileWriter(data_path, data_batch.schema) + data_writer.write_batch(data_batch) + + offsets = offsets_batch.column(0).to_numpy() + adjusted_offsets = offsets + global_row_count + 
offsets_writer.write_batch( + pa.RecordBatch.from_arrays( + [pa.array(adjusted_offsets, type=pa.uint64())], + schema=offsets_schema, + ) + ) + last_offset = np.uint64(0) + for idx, offset in enumerate(offsets): + total_partition_counts[idx] += np.uint64(offset) - last_offset + last_offset = np.uint64(offset) + global_row_count += np.uint64(data_batch.num_rows) + num_batches += 1 + + if data_writer is None: + data_writer = LanceFileWriter(data_path, data_schema) + + metadata = _shuffle_metadata( + num_partitions, num_batches, total_partition_counts.tolist() + ) + for key, value in metadata.items(): + data_writer.add_schema_metadata(key, value) + offsets_writer.add_schema_metadata(key, value) + + data_writer.close() + offsets_writer.close() + return ["shuffle_data.lance", "shuffle_offsets.lance"] def _train_ivf_pq_index_on_cuvs( @@ -283,7 +370,7 @@ def one_pass_assign_ivf_pq_on_cuvs( pq_codebook: np.ndarray, trained_index=None, dst_dataset_uri: str | Path | None = None, - batch_size: int = 1024 * 10 * 4, + batch_size: int = 1024 * 128, *, filter_nan: bool = True, ): @@ -308,7 +395,9 @@ def one_pass_assign_ivf_pq_on_cuvs( progress = _make_progress(num_rows) progress.set_description("Assigning partitions and computing pq codes") - def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: + def _partition_and_pq_codes_assignment() -> Iterator[ + tuple[pa.RecordBatch, pa.RecordBatch] + ]: for batch in dataset.to_batches( columns=[column], filter=filt, @@ -342,41 +431,25 @@ def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: row_ids, partitions, pq_codes, + ivf_centroids.shape[0], num_sub_vectors, ) - progress.update(partition_batch.num_rows) + progress.update(len(row_ids)) yield partition_batch - output_schema = pa.schema( - [ - pa.field("row_id", pa.uint64()), - pa.field("__ivf_part_id", pa.uint32()), - pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), - ] - ) - rbr = pa.RecordBatchReader.from_batches( - output_schema, 
_partition_and_pq_codes_assignment() - ) if dst_dataset_uri is None: dst_dataset_uri = tempfile.mkdtemp() if re.search(r".:\\", dst_dataset_uri) is not None: dst_dataset_uri = dst_dataset_uri.replace("\\", "/", 1) - - from . import write_dataset - - ds = write_dataset( - rbr, - dst_dataset_uri, - schema=output_schema, - data_storage_version="legacy", + shuffle_buffers = _write_v3_shuffle_files( + str(dst_dataset_uri), + _partition_and_pq_codes_assignment(), + num_partitions=ivf_centroids.shape[0], + num_sub_vectors=num_sub_vectors, ) progress.close() LOGGER.info("Saved precomputed pq_codes to %s", dst_dataset_uri) - - shuffle_buffers = [ - data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() - ] return str(dst_dataset_uri), shuffle_buffers diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 1b24962a700..5dee7767918 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2996,7 +2996,7 @@ def _create_index_impl( ivf_centroids, pq_codebook, trained_index=trained_index, - batch_size=20480, + batch_size=1024 * 128, filter_nan=filter_nan, ) timers["ivf+pq_assign:end"] = time.time() @@ -3005,9 +3005,7 @@ def _create_index_impl( ) LOGGER.info("cuVS ivf+pq transform time: %ss", ivfpq_assign_time) kwargs["precomputed_shuffle_buffers"] = shuffle_buffers - kwargs["precomputed_shuffle_buffers_path"] = os.path.join( - shuffle_output_dir, "data" - ) + kwargs["precomputed_shuffle_buffers_path"] = shuffle_output_dir else: from .vector import ( one_pass_assign_ivf_pq_on_accelerator, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index a54475e1ae0..cf0bfe6e2bf 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -805,13 +805,17 @@ def transform(index, vectors): batch_size=8, ) - shuffle_ds = lance.dataset(shuffle_uri) - batch = next(shuffle_ds.to_batches()) + from lance.file import LanceFileReader 
- assert len(shuffle_buffers) > 0 - assert batch.column("row_id").type == pa.uint64() - assert batch.column("__ivf_part_id").type == pa.uint32() - assert batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) + data_reader = LanceFileReader(str(Path(shuffle_uri) / "shuffle_data.lance")) + offsets_reader = LanceFileReader(str(Path(shuffle_uri) / "shuffle_offsets.lance")) + data_batch = next(data_reader.read_all(batch_size=1024).to_batches()) + offsets_batch = next(offsets_reader.read_all(batch_size=1024).to_batches()) + + assert shuffle_buffers == ["shuffle_data.lance", "shuffle_offsets.lance"] + assert data_batch.column("_rowid").type == pa.uint64() + assert data_batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) + assert offsets_batch.column("offset").type == pa.uint64() def test_use_index(dataset, tmp_path): diff --git a/rust/lance-index/src/vector/v3/shuffler.rs b/rust/lance-index/src/vector/v3/shuffler.rs index 0bf714df237..45c719d523a 100644 --- a/rust/lance-index/src/vector/v3/shuffler.rs +++ b/rust/lance-index/src/vector/v3/shuffler.rs @@ -4,6 +4,7 @@ //! Shuffler is a component that takes a stream of record batches and shuffles them into //! the corresponding IVF partitions. +use std::collections::HashMap; use std::ops::Range; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; @@ -36,6 +37,13 @@ use object_store::path::Path; use crate::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN}; +const SHUFFLE_NUM_PARTITIONS_METADATA_KEY: &str = "lance:shuffle:num_partitions"; +const SHUFFLE_NUM_BATCHES_METADATA_KEY: &str = "lance:shuffle:num_batches"; +const SHUFFLE_PARTITION_COUNTS_METADATA_KEY: &str = "lance:shuffle:partition_counts"; +const SHUFFLE_TOTAL_LOSS_METADATA_KEY: &str = "lance:shuffle:total_loss"; +pub const SHUFFLE_DATA_FILE_NAME: &str = "shuffle_data.lance"; +pub const SHUFFLE_OFFSETS_FILE_NAME: &str = "shuffle_offsets.lance"; + #[async_trait::async_trait] /// A reader that can read the shuffled partitions. 
pub trait ShuffleReader: Send + Sync { @@ -435,7 +443,7 @@ impl Shuffler for TwoFileShuffler { ); // Create data file writer - let data_path = self.output_dir.child("shuffle_data.lance"); + let data_path = self.output_dir.child(SHUFFLE_DATA_FILE_NAME); let spill_path = self.output_dir.child("shuffle_data.spill"); let writer = self.object_store.create(&data_path).await?; let mut file_writer = FileWriter::try_new( @@ -446,7 +454,7 @@ impl Shuffler for TwoFileShuffler { .with_page_metadata_spill(self.object_store.clone(), spill_path); // Create offsets file writer - let offsets_path = self.output_dir.child("shuffle_offsets.lance"); + let offsets_path = self.output_dir.child(SHUFFLE_OFFSETS_FILE_NAME); let spill_path = self.output_dir.child("shuffle_offsets.spill"); let writer = self.object_store.create(&offsets_path).await?; let mut offsets_writer = FileWriter::try_new( @@ -527,13 +535,37 @@ impl Shuffler for TwoFileShuffler { .await?; } + let partition_counts_json = serde_json::to_string(&partition_counts).map_err(|e| { + Error::invalid_input(format!("Failed to serialize shuffle partition counts: {e}")) + })?; + let num_partitions_str = num_partitions.to_string(); + let num_batches_str = num_batches + .load(std::sync::atomic::Ordering::Relaxed) + .to_string(); + let total_loss_str = total_loss.lock().unwrap().to_string(); + for writer in [&mut file_writer, &mut offsets_writer] { + writer.add_schema_metadata( + SHUFFLE_NUM_PARTITIONS_METADATA_KEY, + num_partitions_str.clone(), + ); + writer.add_schema_metadata(SHUFFLE_NUM_BATCHES_METADATA_KEY, num_batches_str.clone()); + writer.add_schema_metadata( + SHUFFLE_PARTITION_COUNTS_METADATA_KEY, + partition_counts_json.clone(), + ); + writer.add_schema_metadata(SHUFFLE_TOTAL_LOSS_METADATA_KEY, total_loss_str.clone()); + } + // Finish files file_writer.finish().await?; offsets_writer.finish().await?; - let num_batches = num_batches.load(std::sync::atomic::Ordering::Relaxed); - - let total_loss_val = 
*total_loss.lock().unwrap(); + let num_batches = num_batches_str + .parse::() + .expect("num_batches string was produced from u64"); + let total_loss_val = total_loss_str + .parse::() + .expect("total_loss string was produced from f64"); TwoFileShuffleReader::try_new( self.object_store.clone(), @@ -558,6 +590,46 @@ pub struct TwoFileShuffleReader { } impl TwoFileShuffleReader { + pub async fn try_open_existing( + object_store: Arc, + output_dir: Path, + data_file: impl AsRef, + offsets_file: impl AsRef, + ) -> Result> { + let scheduler_config = SchedulerConfig::max_bandwidth(&object_store); + let scheduler = ScanScheduler::new(object_store, scheduler_config); + + let file_reader = FileReader::try_open( + scheduler + .open_file( + &output_dir.child(data_file.as_ref()), + &CachedFileSize::unknown(), + ) + .await?, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await?; + + let offsets_reader = FileReader::try_open( + scheduler + .open_file( + &output_dir.child(offsets_file.as_ref()), + &CachedFileSize::unknown(), + ) + .await?, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await?; + + Self::from_existing_readers(scheduler, file_reader, offsets_reader) + } + async fn try_new( object_store: Arc, output_dir: Path, @@ -573,7 +645,7 @@ impl TwoFileShuffleReader { let scheduler_config = SchedulerConfig::max_bandwidth(&object_store); let scheduler = ScanScheduler::new(object_store, scheduler_config); - let data_path = output_dir.child("shuffle_data.lance"); + let data_path = output_dir.child(SHUFFLE_DATA_FILE_NAME); let file_reader = FileReader::try_open( scheduler .open_file(&data_path, &CachedFileSize::unknown()) @@ -585,7 +657,7 @@ impl TwoFileShuffleReader { ) .await?; - let offsets_path = output_dir.child("shuffle_offsets.lance"); + let offsets_path = output_dir.child(SHUFFLE_OFFSETS_FILE_NAME); let offsets_reader = FileReader::try_open( scheduler .open_file(&offsets_path, 
&CachedFileSize::unknown()) @@ -608,6 +680,87 @@ impl TwoFileShuffleReader { })) } + fn from_existing_readers( + scheduler: Arc, + file_reader: FileReader, + offsets_reader: FileReader, + ) -> Result> { + let metadata: &HashMap = &offsets_reader.schema().metadata; + + let num_partitions = metadata + .get(SHUFFLE_NUM_PARTITIONS_METADATA_KEY) + .ok_or_else(|| { + Error::invalid_input(format!( + "Missing required metadata key {SHUFFLE_NUM_PARTITIONS_METADATA_KEY} in precomputed V3 shuffle offsets file" + )) + })? + .parse::() + .map_err(|e| { + Error::invalid_input(format!( + "Invalid value for {SHUFFLE_NUM_PARTITIONS_METADATA_KEY}: {e}" + )) + })?; + let num_batches = metadata + .get(SHUFFLE_NUM_BATCHES_METADATA_KEY) + .ok_or_else(|| { + Error::invalid_input(format!( + "Missing required metadata key {SHUFFLE_NUM_BATCHES_METADATA_KEY} in precomputed V3 shuffle offsets file" + )) + })? + .parse::() + .map_err(|e| { + Error::invalid_input(format!( + "Invalid value for {SHUFFLE_NUM_BATCHES_METADATA_KEY}: {e}" + )) + })?; + let partition_counts = serde_json::from_str::>( + metadata + .get(SHUFFLE_PARTITION_COUNTS_METADATA_KEY) + .ok_or_else(|| { + Error::invalid_input(format!( + "Missing required metadata key {SHUFFLE_PARTITION_COUNTS_METADATA_KEY} in precomputed V3 shuffle offsets file" + )) + })?, + ) + .map_err(|e| { + Error::invalid_input(format!( + "Invalid value for {SHUFFLE_PARTITION_COUNTS_METADATA_KEY}: {e}" + )) + })?; + if partition_counts.len() != num_partitions { + return Err(Error::invalid_input(format!( + "Precomputed V3 shuffle partition count length {} does not match num_partitions {}", + partition_counts.len(), + num_partitions + ))); + } + let total_loss = metadata + .get(SHUFFLE_TOTAL_LOSS_METADATA_KEY) + .map(|value| { + value.parse::().map_err(|e| { + Error::invalid_input(format!( + "Invalid value for {SHUFFLE_TOTAL_LOSS_METADATA_KEY}: {e}" + )) + }) + }) + .transpose()? 
+ .unwrap_or(0.0); + + if num_batches == 0 { + return Ok(Box::new(EmptyReader)); + } + + Ok(Box::new(Self { + _scheduler: scheduler, + file_reader, + offsets_reader, + num_partitions, + num_batches, + partition_counts, + total_loss, + })) + } + async fn partition_ranges(&self, partition_id: usize) -> Result>> { let mut positions = Vec::with_capacity(self.num_batches as usize * 2); for batch_idx in 0..self.num_batches { @@ -844,6 +997,42 @@ mod tests { assert!((loss - 4.25).abs() < 1e-10, "expected 4.25, got {}", loss); } + #[tokio::test] + async fn test_two_file_shuffler_reopen_existing_files() { + let dir = TempStrDir::default(); + let output_dir = Path::from(dir.as_ref()); + let num_partitions = 3; + + let batch1 = make_batch(&[0, 1, 2], &[10, 20, 30], Some(1.5)); + let batch2 = make_batch(&[2, 0, 1, 0], &[40, 50, 60, 70], Some(2.0)); + + let shuffler = TwoFileShuffler::new(output_dir.clone(), num_partitions); + let stream = batches_to_stream(vec![batch1, batch2]); + let _ = shuffler.shuffle(stream).await.unwrap(); + + let reopened = TwoFileShuffleReader::try_open_existing( + Arc::new(ObjectStore::local()), + output_dir, + SHUFFLE_DATA_FILE_NAME, + SHUFFLE_OFFSETS_FILE_NAME, + ) + .await + .unwrap(); + + assert_eq!(reopened.partition_size(0).unwrap(), 3); + assert_eq!(reopened.partition_size(1).unwrap(), 2); + assert_eq!(reopened.partition_size(2).unwrap(), 2); + + let p0 = collect_partition(reopened.as_ref(), 0).await.unwrap(); + let vals: &Int32Array = p0.column_by_name("val").unwrap().as_primitive(); + let mut v: Vec = vals.iter().map(|x| x.unwrap()).collect(); + v.sort(); + assert_eq!(v, vec![10, 50, 70]); + + let loss = reopened.total_loss().unwrap(); + assert!((loss - 3.5).abs() < 1e-10, "expected 3.5, got {}", loss); + } + #[tokio::test] async fn test_two_file_shuffler_single_batch() { let dir = TempStrDir::default(); diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 24298cbba18..9a7001834d0 100644 --- 
a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -3,6 +3,7 @@ use std::collections::HashSet; use std::future; +use std::path::Path as StdPath; use std::sync::Arc; use std::{collections::HashMap, pin::Pin}; @@ -44,7 +45,10 @@ use lance_index::vector::quantizer::{QuantizerMetadata, QuantizerStorage}; use lance_index::vector::shared::{SupportedIvfIndexType, write_unified_ivf_and_index_metadata}; use lance_index::vector::storage::STORAGE_METADATA_KEY; use lance_index::vector::transform::Flatten; -use lance_index::vector::v3::shuffler::{EmptyReader, IvfShufflerReader}; +use lance_index::vector::v3::shuffler::{ + EmptyReader, IvfShufflerReader, SHUFFLE_DATA_FILE_NAME, SHUFFLE_OFFSETS_FILE_NAME, + TwoFileShuffleReader, +}; use lance_index::vector::v3::subindex::SubIndexType; use lance_index::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN, PQ_CODE_COLUMN, VectorIndex}; use lance_index::vector::{PART_ID_FIELD, ivf::storage::IvfModel}; @@ -141,6 +145,43 @@ type BuildStream = Pin::Storage, S, f64)>>> + Send>>; impl IvfIndexBuilder { + async fn try_open_precomputed_v3_shuffle_reader( + &self, + root: &Path, + files: &[String], + ) -> Result>> { + if files.len() != 2 { + return Ok(None); + } + + let mut data_file = None; + let mut offsets_file = None; + for file in files { + let Some(file_name) = StdPath::new(file).file_name() else { + return Ok(None); + }; + match file_name.to_string_lossy().as_ref() { + SHUFFLE_DATA_FILE_NAME => data_file = Some(SHUFFLE_DATA_FILE_NAME), + SHUFFLE_OFFSETS_FILE_NAME => offsets_file = Some(SHUFFLE_OFFSETS_FILE_NAME), + _ => return Ok(None), + } + } + let (Some(data_file), Some(offsets_file)) = (data_file, offsets_file) else { + return Ok(None); + }; + + Ok(Some( + TwoFileShuffleReader::try_open_existing( + Arc::new(ObjectStore::local()), + root.clone(), + data_file, + offsets_file, + ) + .await? 
+ .into(), + )) + } + #[allow(clippy::too_many_arguments)] pub fn new( dataset: Dataset, @@ -528,13 +569,30 @@ impl IvfIndexBuilder .as_ref() .and_then(|p| p.precomputed_shuffle_buffers.as_ref()) { - Some((uri, _)) => { + Some((uri, files)) => { + if let Some(reader) = self + .try_open_precomputed_v3_shuffle_reader(uri, files) + .await? + { + log::info!("shuffle with precomputed V3 shuffle files from {}", uri); + self.shuffle_reader = Some(reader); + return Ok(()); + } + let uri = to_local_path(uri); - // the uri points to data directory, - // so need to trim the "data" suffix for reading the dataset - let uri = uri.trim_end_matches("data"); + let uri = if StdPath::new(&uri) + .file_name() + .is_some_and(|name| name == "data") + { + StdPath::new(&uri) + .parent() + .map(|path| path.to_string_lossy().to_string()) + .unwrap_or(uri) + } else { + uri + }; log::info!("shuffle with precomputed shuffle buffers from {}", uri); - let ds = Dataset::open(uri).await?; + let ds = Dataset::open(&uri).await?; ds.scan().try_into_stream().await? 
} _ => { From ec99cda4bafe9be41a38aff747107a6cb9ba2e7e Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 2 Apr 2026 22:17:50 +0800 Subject: [PATCH 09/21] python: fix cuvs pq_dim semantics --- python/python/lance/cuvs.py | 2 +- python/python/tests/test_vector_index.py | 37 ++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index ed8cad83907..a3ad062d8fa 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -349,7 +349,7 @@ def _train_ivf_pq_index_on_cuvs( matrix.shape[0], num_partitions, sample_rate ), pq_bits=num_bits, - pq_dim=num_sub_vectors, + pq_dim=dimension // num_sub_vectors, codebook_kind="subspace", force_random_rotation=False, add_data_on_build=False, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index cf0bfe6e2bf..3c7d92290b7 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -743,6 +743,43 @@ def build(build_params, matrix): assert pq_codebook.shape == (4, 256, 4) +def test_train_ivf_pq_on_cuvs_uses_subvector_dimension_for_pq_dim( + tmp_path, monkeypatch +): + dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) + calls = {} + + class FakeIndex: + centers = np.random.randn(4, 16).astype(np.float32) + pq_centers = np.random.randn(2, 256, 8).astype(np.float32) + + class FakeIvfPqModule: + class IndexParams: + def __init__(self, **kwargs): + calls.update(kwargs) + + @staticmethod + def build(build_params, matrix): + assert matrix.shape[1] == 16 + return FakeIndex() + + monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) + + centroids, pq_codebook = lance_cuvs.train_ivf_pq_on_cuvs( + dataset, + "vector", + 4, + "l2", + "cuvs", + 2, + sample_rate=4, + ) + + assert calls["pq_dim"] == 8 + assert centroids.shape == (4, 16) + assert pq_codebook.shape == (2, 256, 8) + + def 
test_cuvs_as_numpy_prefers_copy_to_host(): class FakeDeviceTensor: def copy_to_host(self): From 1991638af63f13b8ae45e8d1d9d0373af2d5f767 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 3 Apr 2026 02:27:12 +0800 Subject: [PATCH 10/21] python: fix cuvs pq_dim semantics --- AGENTS.md | 6 ++++ python/python/lance/cuvs.py | 12 ++++++- python/python/tests/test_vector_index.py | 46 ++++++++++++++++++++++-- 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 8543d23521a..ec2b3e21773 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -53,6 +53,12 @@ cd test_data && docker compose up -d AWS_DEFAULT_REGION=us-east-1 pytest --run-integration python/tests/test_s3_ddb.py ``` +### Benchmarking Discipline + +- Benchmark machines must use release builds only. For Python bindings, always run `maturin develop --release` before collecting any timing data. +- Never use `maturin develop` without `--release` on a benchmark host. If a dev-profile rebuild is needed for functional debugging, use a different machine or clearly discard all performance results collected afterwards. +- Before trusting a benchmark result, verify the mounted benchmark volume and the active build profile. 
+ ## Coding Standards ### General diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index a3ad062d8fa..07ae8c01a6b 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -155,6 +155,7 @@ def _normalize_pq_codebook( pq_centers = _as_numpy(index.pq_centers) expected_shapes = { + (subvector_dim, num_sub_vectors, pq_book_size): (1, 2, 0), (num_sub_vectors, subvector_dim, pq_book_size): (0, 2, 1), (num_sub_vectors, pq_book_size, subvector_dim): None, } @@ -349,7 +350,7 @@ def _train_ivf_pq_index_on_cuvs( matrix.shape[0], num_partitions, sample_rate ), pq_bits=num_bits, - pq_dim=dimension // num_sub_vectors, + pq_dim=num_sub_vectors, codebook_kind="subspace", force_random_rotation=False, add_data_on_build=False, @@ -391,6 +392,15 @@ def one_pass_assign_ivf_pq_on_cuvs( "one_pass_assign_ivf_pq_on_cuvs requires a trained cuVS index for " "single-node transform" ) + transform_code_width = (trained_index.pq_dim * trained_index.pq_bits + 7) // 8 + if transform_code_width != num_sub_vectors: + raise ValueError( + "cuVS transform output is incompatible with Lance IVF_PQ for this " + "configuration: expected " + f"{num_sub_vectors} PQ code columns, but cuVS will produce " + f"{transform_code_width}. Use a configuration where " + "ceil(pq_dim * pq_bits / 8) == num_sub_vectors." 
+ ) progress = _make_progress(num_rows) progress.set_description("Assigning partitions and computing pq codes") diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 3c7d92290b7..4448c7b57f5 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -743,7 +743,7 @@ def build(build_params, matrix): assert pq_codebook.shape == (4, 256, 4) -def test_train_ivf_pq_on_cuvs_uses_subvector_dimension_for_pq_dim( +def test_train_ivf_pq_on_cuvs_uses_num_sub_vectors_for_pq_dim( tmp_path, monkeypatch ): dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) @@ -775,11 +775,22 @@ def build(build_params, matrix): sample_rate=4, ) - assert calls["pq_dim"] == 8 + assert calls["pq_dim"] == 2 assert centroids.shape == (4, 16) assert pq_codebook.shape == (2, 256, 8) +def test_normalize_pq_codebook_accepts_subvector_dim_first_layout(): + class FakeIndex: + pq_centers = np.random.randn(8, 16, 256).astype(np.float32) + + pq_codebook = lance_cuvs._normalize_pq_codebook( + FakeIndex(), num_sub_vectors=16, num_bits=8, dimension=128 + ) + + assert pq_codebook.shape == (16, 256, 8) + + def test_cuvs_as_numpy_prefers_copy_to_host(): class FakeDeviceTensor: def copy_to_host(self): @@ -855,6 +866,37 @@ def transform(index, vectors): assert offsets_batch.column("offset").type == pa.uint64() +def test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( + tmp_path, + monkeypatch, +): + tbl = create_table(nvec=32, ndim=128) + dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_incompatible") + + ivf_centroids = np.random.randn(4, 128).astype(np.float32) + pq_codebook = np.random.randn(16, 256, 8).astype(np.float32) + monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: object()) + + class FakeIndex: + pq_dim = 8 + pq_bits = 8 + + with pytest.raises( + ValueError, + match="cuVS transform output is incompatible with Lance IVF_PQ", + ): + 
lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( + dataset, + "vector", + "l2", + "cuvs", + ivf_centroids, + pq_codebook, + trained_index=FakeIndex(), + batch_size=8, + ) + + def test_use_index(dataset, tmp_path): ann_ds = lance.write_dataset(dataset.to_table(), tmp_path / "indexed.lance") ann_ds = ann_ds.create_index( From 54c29d863f3ecf1a1df593f61f2afad985d237aa Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 3 Apr 2026 02:33:46 +0800 Subject: [PATCH 11/21] python: revert cuvs shuffle dataset integration --- python/python/lance/cuvs.py | 160 +++++------------------ python/python/tests/test_vector_index.py | 15 +-- 2 files changed, 40 insertions(+), 135 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 07ae8c01a6b..0ab9fcd6d24 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -3,8 +3,6 @@ from __future__ import annotations -import json -import os import re import tempfile from importlib import import_module @@ -14,7 +12,6 @@ import pyarrow.compute as pc from .dependencies import numpy as np -from .file import LanceFileWriter from .log import LOGGER from .util import _normalize_metric_type @@ -195,114 +192,6 @@ def _sample_training_table( return dataset.to_table(columns=[column], filter=filt, limit=train_rows) -def _make_shuffle_batch( - row_ids: np.ndarray, - partitions: np.ndarray, - pq_codes: np.ndarray, - num_partitions: int, - num_sub_vectors: int, -) -> tuple[pa.RecordBatch, pa.RecordBatch]: - sort_indices = np.argsort(partitions, kind="stable") - row_ids = row_ids[sort_indices] - partitions = partitions[sort_indices] - pq_codes = pq_codes[sort_indices] - - pq_values = pa.array(pq_codes.reshape(-1), type=pa.uint8()) - pq_code_array = pa.FixedSizeListArray.from_arrays(pq_values, num_sub_vectors) - partition_counts = np.bincount(partitions, minlength=num_partitions).astype( - np.uint64, copy=False - ) - offsets = np.cumsum(partition_counts, dtype=np.uint64) - data_batch = pa.RecordBatch.from_arrays( 
- [ - pa.array(row_ids, type=pa.uint64()), - pq_code_array, - ], - schema=pa.schema( - [ - pa.field("_rowid", pa.uint64()), - pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), - ] - ), - ) - offsets_batch = pa.RecordBatch.from_arrays( - [pa.array(offsets, type=pa.uint64())], - schema=pa.schema([pa.field("offset", pa.uint64())]), - ) - return data_batch, offsets_batch - - -def _shuffle_metadata( - num_partitions: int, num_batches: int, partition_counts -) -> dict[str, str]: - return { - "lance:shuffle:num_partitions": str(num_partitions), - "lance:shuffle:num_batches": str(num_batches), - "lance:shuffle:partition_counts": json.dumps(list(partition_counts)), - "lance:shuffle:total_loss": "0.0", - } - - -def _write_v3_shuffle_files( - output_root: str, - batches: Iterator[tuple[pa.RecordBatch, pa.RecordBatch]], - *, - num_partitions: int, - num_sub_vectors: int, -) -> list[str]: - os.makedirs(output_root, exist_ok=True) - data_path = os.path.join(output_root, "shuffle_data.lance") - offsets_path = os.path.join(output_root, "shuffle_offsets.lance") - - data_schema = pa.schema( - [ - pa.field("_rowid", pa.uint64()), - pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), - ] - ) - offsets_schema = pa.schema([pa.field("offset", pa.uint64())]) - - data_writer = None - offsets_writer = LanceFileWriter(offsets_path, offsets_schema) - total_partition_counts = np.zeros(num_partitions, dtype=np.uint64) - global_row_count = np.uint64(0) - num_batches = 0 - - for data_batch, offsets_batch in batches: - if data_writer is None: - data_writer = LanceFileWriter(data_path, data_batch.schema) - data_writer.write_batch(data_batch) - - offsets = offsets_batch.column(0).to_numpy() - adjusted_offsets = offsets + global_row_count - offsets_writer.write_batch( - pa.RecordBatch.from_arrays( - [pa.array(adjusted_offsets, type=pa.uint64())], - schema=offsets_schema, - ) - ) - last_offset = np.uint64(0) - for idx, offset in enumerate(offsets): - 
total_partition_counts[idx] += np.uint64(offset) - last_offset - last_offset = np.uint64(offset) - global_row_count += np.uint64(data_batch.num_rows) - num_batches += 1 - - if data_writer is None: - data_writer = LanceFileWriter(data_path, data_schema) - - metadata = _shuffle_metadata( - num_partitions, num_batches, total_partition_counts.tolist() - ) - for key, value in metadata.items(): - data_writer.add_schema_metadata(key, value) - offsets_writer.add_schema_metadata(key, value) - - data_writer.close() - offsets_writer.close() - return ["shuffle_data.lance", "shuffle_offsets.lance"] - - def _train_ivf_pq_index_on_cuvs( dataset, column: str, @@ -375,6 +264,8 @@ def one_pass_assign_ivf_pq_on_cuvs( *, filter_nan: bool = True, ): + from . import write_dataset + if accelerator != "cuvs": raise ValueError("cuVS acceleration only supports accelerator='cuvs'") @@ -405,9 +296,15 @@ def one_pass_assign_ivf_pq_on_cuvs( progress = _make_progress(num_rows) progress.set_description("Assigning partitions and computing pq codes") - def _partition_and_pq_codes_assignment() -> Iterator[ - tuple[pa.RecordBatch, pa.RecordBatch] - ]: + output_schema = pa.schema( + [ + pa.field("row_id", pa.uint64()), + pa.field("__ivf_part_id", pa.uint32()), + pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), + ] + ) + + def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: for batch in dataset.to_batches( columns=[column], filter=filt, @@ -437,26 +334,37 @@ def _partition_and_pq_codes_assignment() -> Iterator[ f"expected {(len(row_ids), num_sub_vectors)}, got {pq_codes.shape}" ) - partition_batch = _make_shuffle_batch( - row_ids, - partitions, - pq_codes, - ivf_centroids.shape[0], - num_sub_vectors, + pq_values = pa.array(pq_codes.reshape(-1), type=pa.uint8()) + pq_code_array = pa.FixedSizeListArray.from_arrays( + pq_values, num_sub_vectors + ) + yield pa.RecordBatch.from_arrays( + [ + pa.array(row_ids, type=pa.uint64()), + pa.array(partitions, 
type=pa.uint32()), + pq_code_array, + ], + schema=output_schema, ) progress.update(len(row_ids)) - yield partition_batch if dst_dataset_uri is None: dst_dataset_uri = tempfile.mkdtemp() if re.search(r".:\\", dst_dataset_uri) is not None: dst_dataset_uri = dst_dataset_uri.replace("\\", "/", 1) - shuffle_buffers = _write_v3_shuffle_files( - str(dst_dataset_uri), - _partition_and_pq_codes_assignment(), - num_partitions=ivf_centroids.shape[0], - num_sub_vectors=num_sub_vectors, + + reader = pa.RecordBatchReader.from_batches( + output_schema, _partition_and_pq_codes_assignment() + ) + ds = write_dataset( + reader, + dst_dataset_uri, + schema=output_schema, + data_storage_version="2.2", ) + shuffle_buffers = [ + data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() + ] progress.close() LOGGER.info("Saved precomputed pq_codes to %s", dst_dataset_uri) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 4448c7b57f5..7ba5feefda6 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -853,17 +853,14 @@ def transform(index, vectors): batch_size=8, ) - from lance.file import LanceFileReader + shuffle_ds = lance.dataset(shuffle_uri) + data_batch = next(shuffle_ds.to_batches(batch_size=1024)) - data_reader = LanceFileReader(str(Path(shuffle_uri) / "shuffle_data.lance")) - offsets_reader = LanceFileReader(str(Path(shuffle_uri) / "shuffle_offsets.lance")) - data_batch = next(data_reader.read_all(batch_size=1024).to_batches()) - offsets_batch = next(offsets_reader.read_all(batch_size=1024).to_batches()) - - assert shuffle_buffers == ["shuffle_data.lance", "shuffle_offsets.lance"] - assert data_batch.column("_rowid").type == pa.uint64() + assert len(shuffle_buffers) > 0 + assert all(path.endswith(".lance") for path in shuffle_buffers) + assert data_batch.column("row_id").type == pa.uint64() + assert data_batch.column("__ivf_part_id").type == pa.uint32() 
assert data_batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) - assert offsets_batch.column("offset").type == pa.uint64() def test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( From 5ecefc4572b27d111bdb06e582f91742bd000a24 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 3 Apr 2026 12:31:32 +0800 Subject: [PATCH 12/21] Support precomputed encoded datasets for IVF_PQ build --- python/python/lance/cuvs.py | 56 +++ python/python/lance/dataset.py | 15 +- python/python/tests/test_vector_index.py | 19 +- python/src/dataset.rs | 13 +- rust/lance-index/src/vector/ivf/builder.rs | 10 +- rust/lance-index/src/vector/ivf/shuffler.rs | 10 +- rust/lance/src/index/vector.rs | 2 + rust/lance/src/index/vector/builder.rs | 101 ++++- .../lance/src/index/vector/encoded_dataset.rs | 370 ++++++++++++++++++ rust/lance/src/index/vector/ivf.rs | 30 +- rust/lance/src/index/vector/ivf/builder.rs | 5 +- 11 files changed, 600 insertions(+), 31 deletions(-) create mode 100644 rust/lance/src/index/vector/encoded_dataset.rs diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 0ab9fcd6d24..ecd3173c249 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -3,6 +3,7 @@ from __future__ import annotations +import json import re import tempfile from importlib import import_module @@ -18,6 +19,16 @@ if TYPE_CHECKING: from pathlib import Path +PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY = ( + "lance:index_build:precomputed_encoded_partition_sizes" +) +PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY = ( + "lance:index_build:precomputed_encoded_partition_fragment_ids" +) +PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY = ( + "lance:index_build:precomputed_encoded_total_loss" +) + def is_cuvs_accelerator(accelerator: object) -> bool: return accelerator == "cuvs" @@ -100,6 +111,45 @@ def _column_to_numpy(table: pa.Table | pa.RecordBatch, column: str) -> np.ndarra return _coerce_float_matrix(np.asarray(values), 
column=column) +def _annotate_precomputed_encoded_dataset( + dataset, + partition_sizes: list[int], + *, + total_loss: float | None = None, +) -> None: + partition_fragments = [[] for _ in range(len(partition_sizes))] + for fragment in dataset.get_fragments(): + fragment_partitions = set() + scanner = ( + dataset.scanner(columns=["__ivf_part_id"]) + .with_fragments([fragment]) + .to_scanner() + ) + for batch in scanner.to_batches(): + fragment_partitions.update( + int(partition_id) + for partition_id in np.unique( + batch.column("__ivf_part_id").to_numpy(zero_copy_only=False) + ) + ) + for partition_id in fragment_partitions: + partition_fragments[partition_id].append(int(fragment.metadata.id)) + + metadata = { + PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY: json.dumps( + [int(size) for size in partition_sizes] + ), + PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY: json.dumps( + partition_fragments + ), + } + if total_loss is not None: + metadata[PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY] = json.dumps( + float(total_loss) + ) + dataset.update_metadata(metadata) + + def _as_numpy(array_like) -> np.ndarray: if isinstance(array_like, np.ndarray): return array_like @@ -295,6 +345,8 @@ def one_pass_assign_ivf_pq_on_cuvs( progress = _make_progress(num_rows) progress.set_description("Assigning partitions and computing pq codes") + num_partitions = ivf_centroids.shape[0] + partition_sizes = np.zeros(num_partitions, dtype=np.int64) output_schema = pa.schema( [ @@ -327,6 +379,7 @@ def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: trained_index, _to_cuvs_transform_input(vectors) ) partitions = _as_numpy(partitions).astype(np.uint32, copy=False) + partition_sizes[:] += np.bincount(partitions, minlength=num_partitions) pq_codes = _as_numpy(pq_codes).astype(np.uint8, copy=False) if pq_codes.shape != (len(row_ids), num_sub_vectors): raise ValueError( @@ -362,6 +415,9 @@ def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: 
schema=output_schema, data_storage_version="2.2", ) + _annotate_precomputed_encoded_dataset( + ds, partition_sizes.astype(int).tolist() + ) shuffle_buffers = [ data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() ] diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 5dee7767918..be5f7cd2c6f 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2985,10 +2985,7 @@ def _create_index_impl( ) LOGGER.info("cuVS ivf+pq training time: %ss", ivfpq_train_time) timers["ivf+pq_assign:start"] = time.time() - ( - shuffle_output_dir, - shuffle_buffers, - ) = one_pass_assign_ivf_pq_on_cuvs( + shuffle_output_dir, _ = one_pass_assign_ivf_pq_on_cuvs( self, column[0], metric, @@ -3004,8 +3001,7 @@ def _create_index_impl( timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] ) LOGGER.info("cuVS ivf+pq transform time: %ss", ivfpq_assign_time) - kwargs["precomputed_shuffle_buffers"] = shuffle_buffers - kwargs["precomputed_shuffle_buffers_path"] = shuffle_output_dir + kwargs["precomputed_encoded_dataset_uri"] = shuffle_output_dir else: from .vector import ( one_pass_assign_ivf_pq_on_accelerator, @@ -3213,6 +3209,13 @@ def _create_index_impl( "Temporary shuffle buffers stored at %s, you may want to delete it.", kwargs["precomputed_shuffle_buffers_path"], ) + if "precomputed_encoded_dataset_uri" in kwargs.keys() and os.path.exists( + kwargs["precomputed_encoded_dataset_uri"] + ): + LOGGER.info( + "Temporary precomputed encoded dataset stored at %s, you may want to delete it.", + kwargs["precomputed_encoded_dataset_uri"], + ) return index def create_index( diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 7ba5feefda6..11c4fdb7ad0 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright The Lance Authors import logging +import json import os import 
platform import random @@ -612,6 +613,9 @@ def fake_assign( ), shuffle_ds_uri, ) + lance_cuvs._annotate_precomputed_encoded_dataset( + shuffle_ds, [len(row_ids), 0, 0, 0] + ) shuffle_buffers = [ data_file.path for frag in shuffle_ds.get_fragments() @@ -803,7 +807,7 @@ def copy_to_host(self): assert array.dtype == np.float32 -def test_one_pass_assign_ivf_pq_on_cuvs_writes_shuffle_buffers(tmp_path, monkeypatch): +def test_one_pass_assign_ivf_pq_on_cuvs_writes_encoded_dataset(tmp_path, monkeypatch): tbl = create_table(nvec=32, ndim=16) dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") @@ -861,6 +865,19 @@ def transform(index, vectors): assert data_batch.column("row_id").type == pa.uint64() assert data_batch.column("__ivf_part_id").type == pa.uint32() assert data_batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) + metadata = shuffle_ds.metadata() + assert json.loads( + metadata[ + lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY + ] + ) == [8, 8, 8, 8] + partition_fragments = json.loads( + metadata[ + lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY + ] + ) + assert len(partition_fragments) == 4 + assert all(partition_fragments) def test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 35306636c93..62e852c117e 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -19,7 +19,6 @@ use chrono::{Duration, TimeDelta, Utc}; use futures::{StreamExt, TryFutureExt}; use lance_index::vector::bq::RQBuildParams; use log::error; -use object_store::path::Path; use pyo3::exceptions::{PyStopIteration, PyTypeError}; use pyo3::types::{PyBytes, PyInt, PyList, PySet, PyString, PyTuple}; use pyo3::{IntoPyObjectExt, prelude::*}; @@ -3372,6 +3371,10 @@ fn prepare_vector_index_params( ivf_params.precomputed_partitions_file = Some(f.to_string()); }; + if let Some(uri) = kwargs.get_item("precomputed_encoded_dataset_uri")? 
{ + ivf_params.precomputed_encoded_dataset_uri = Some(uri.to_string()); + }; + if let Some(storage_options) = storage_options { ivf_params.storage_options = Some(storage_options); } @@ -3381,18 +3384,12 @@ fn prepare_vector_index_params( kwargs.get_item("precomputed_shuffle_buffers_path")?, ) { (Some(l), Some(p)) => { - let path = Path::parse(p.to_string()).map_err(|e| { - PyValueError::new_err(format!( - "Failed to parse precomputed_shuffle_buffers_path: {}", - e - )) - })?; let list = l .downcast::()? .iter() .map(|f| f.to_string()) .collect(); - ivf_params.precomputed_shuffle_buffers = Some((path, list)); + ivf_params.precomputed_shuffle_buffers = Some((p.to_string(), list)); } (None, None) => {} _ => { diff --git a/rust/lance-index/src/vector/ivf/builder.rs b/rust/lance-index/src/vector/ivf/builder.rs index 72e05555441..9154adbcd80 100644 --- a/rust/lance-index/src/vector/ivf/builder.rs +++ b/rust/lance-index/src/vector/ivf/builder.rs @@ -9,8 +9,6 @@ use std::sync::Arc; use arrow_array::cast::AsArray; use arrow_array::{Array, FixedSizeListArray, UInt32Array, UInt64Array}; use futures::TryStreamExt; -use object_store::path::Path; - use lance_core::error::{Error, Result}; use lance_io::stream::RecordBatchStream; @@ -48,7 +46,12 @@ pub struct IvfBuildParams { /// requires `centroids` to be set /// /// The input is expected to be (/dir/to/buffers, [buffer1.lance, buffer2.lance, ...]) - pub precomputed_shuffle_buffers: Option<(Path, Vec)>, + pub precomputed_shuffle_buffers: Option<(String, Vec)>, + + /// Precomputed encoded dataset (_rowid/row_id -> partition_id, pq_code). + /// Mutually exclusive with `precomputed_partitions_file` and `precomputed_shuffle_buffers`. + /// Requires `centroids` to be set. 
+ pub precomputed_encoded_dataset_uri: Option, pub shuffle_partition_batches: usize, @@ -69,6 +72,7 @@ impl Default for IvfBuildParams { sample_rate: 256, // See faiss precomputed_partitions_file: None, precomputed_shuffle_buffers: None, + precomputed_encoded_dataset_uri: None, shuffle_partition_batches: 1024 * 10, shuffle_partition_concurrency: 2, storage_options: None, diff --git a/rust/lance-index/src/vector/ivf/shuffler.rs b/rust/lance-index/src/vector/ivf/shuffler.rs index f4e03c8f036..f78be7b0be2 100644 --- a/rust/lance-index/src/vector/ivf/shuffler.rs +++ b/rust/lance-index/src/vector/ivf/shuffler.rs @@ -246,12 +246,18 @@ pub async fn shuffle_dataset( num_partitions: u32, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(Path, Vec)>, + precomputed_shuffle_buffers: Option<(String, Vec)>, ) -> Result>>> { // step 1: either use precomputed shuffle files or write shuffle data to a file let shuffler = if let Some((path, buffers)) = precomputed_shuffle_buffers { info!("Precomputed shuffle files provided, skip calculation of IVF partition."); - let mut shuffler = IvfShuffler::try_new(num_partitions, Some(path), true, None)?; + if path.contains("://") { + return Err(Error::not_supported( + "legacy IVF shuffler does not support remote precomputed_shuffle_buffers; use the V3 vector index builder path instead".to_string(), + )); + } + let mut shuffler = + IvfShuffler::try_new(num_partitions, Some(Path::parse(&path)?), true, None)?; unsafe { shuffler.set_unsorted_buffers(&buffers); } diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index c5c9038403e..13176c3bca8 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use std::{any::Any, collections::HashMap}; pub mod builder; +mod encoded_dataset; pub mod ivf; pub mod pq; pub mod utils; @@ -1655,6 +1656,7 @@ fn derive_ivf_params(ivf_model: &IvfModel) -> IvfBuildParams { 
sample_rate: 256, // Default precomputed_partitions_file: None, precomputed_shuffle_buffers: None, + precomputed_encoded_dataset_uri: None, shuffle_partition_batches: 1024 * 10, // Default shuffle_partition_concurrency: 2, // Default storage_options: None, diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 9a7001834d0..258e978f1ac 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -71,8 +71,11 @@ use lance_index::{ MIN_PARTITION_SIZE_PERCENT, }; use lance_io::local::to_local_path; +use lance_io::object_store::{ + ObjectStore, ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor, +}; use lance_io::stream::RecordBatchStream; -use lance_io::{object_store::ObjectStore, stream::RecordBatchStreamAdapter}; +use lance_io::stream::RecordBatchStreamAdapter; use lance_linalg::distance::{DistanceType, Dot, L2, Normalize}; use lance_linalg::kernels::normalize_fsl; use log::info; @@ -82,12 +85,14 @@ use tracing::{Level, instrument, span}; use crate::Dataset; use crate::dataset::ProjectionRequest; +use crate::dataset::builder::DatasetBuilder; use crate::dataset::index::dataset_format_version; use crate::index::vector::ivf::v2::PartitionEntry; use crate::index::vector::utils::{infer_vector_dim, infer_vector_element_type}; use super::v2::IVFIndex; use super::{ + encoded_dataset::EncodedDatasetShuffleReader, ivf::load_precomputed_partitions_if_available, utils::{self, get_vector_type}, }; @@ -145,9 +150,41 @@ type BuildStream = Pin::Storage, S, f64)>>> + Send>>; impl IvfIndexBuilder { + fn precomputed_shuffle_buffers_uri(root: &str) -> String { + let uri = root.to_string(); + if uri.contains("://") { + uri + } else { + to_local_path(&Path::from(root)) + } + } + + fn precomputed_shuffle_buffers_root_uri(root: &str) -> String { + let uri = Self::precomputed_shuffle_buffers_uri(root); + if uri.ends_with("/data") { + uri.trim_end_matches("/data").to_string() + } else { + uri + } + 
} + + fn object_store_params(&self) -> ObjectStoreParams { + let mut params = ObjectStoreParams::default(); + if let Some(storage_options) = self + .ivf_params + .as_ref() + .and_then(|params| params.storage_options.clone()) + { + params.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(storage_options), + )); + } + params + } + async fn try_open_precomputed_v3_shuffle_reader( &self, - root: &Path, + root: &str, files: &[String], ) -> Result>> { if files.len() != 2 { @@ -169,11 +206,19 @@ impl IvfIndexBuilder let (Some(data_file), Some(offsets_file)) = (data_file, offsets_file) else { return Ok(None); }; + let registry = Arc::new(ObjectStoreRegistry::default()); + let params = self.object_store_params(); + let (object_store, output_dir) = ObjectStore::from_uri_and_params( + registry, + &Self::precomputed_shuffle_buffers_root_uri(root), + ¶ms, + ) + .await?; Ok(Some( TwoFileShuffleReader::try_open_existing( - Arc::new(ObjectStore::local()), - root.clone(), + object_store, + output_dir, data_file, offsets_file, ) @@ -182,6 +227,19 @@ impl IvfIndexBuilder )) } + async fn try_open_precomputed_encoded_dataset_reader( + &self, + uri: &str, + ) -> Result> { + let storage_options = self + .ivf_params + .as_ref() + .and_then(|params| params.storage_options.as_ref()); + Ok(Arc::new( + EncodedDatasetShuffleReader::try_open(uri, storage_options).await?, + )) + } + #[allow(clippy::too_many_arguments)] pub fn new( dataset: Dataset, @@ -564,6 +622,19 @@ impl IvfIndexBuilder return Err(Error::invalid_input("dataset not set before shuffling")); }; + if let Some(uri) = self + .ivf_params + .as_ref() + .and_then(|params| params.precomputed_encoded_dataset_uri.as_deref()) + { + log::info!("shuffle with precomputed encoded dataset from {}", uri); + self.shuffle_reader = Some( + self.try_open_precomputed_encoded_dataset_reader(uri) + .await?, + ); + return Ok(()); + } + let stream = match self .ivf_params .as_ref() @@ -579,7 +650,7 @@ impl 
IvfIndexBuilder return Ok(()); } - let uri = to_local_path(uri); + let uri = Self::precomputed_shuffle_buffers_root_uri(uri); let uri = if StdPath::new(&uri) .file_name() .is_some_and(|name| name == "data") @@ -592,7 +663,15 @@ impl IvfIndexBuilder uri }; log::info!("shuffle with precomputed shuffle buffers from {}", uri); - let ds = Dataset::open(&uri).await?; + let mut builder = DatasetBuilder::from_uri(&uri); + if let Some(storage_options) = self + .ivf_params + .as_ref() + .and_then(|params| params.storage_options.clone()) + { + builder = builder.with_storage_options(storage_options); + } + let ds = builder.load().await?; ds.scan().try_into_stream().await? } _ => { @@ -2296,4 +2375,14 @@ mod tests { let row_ids = batches[0][ROW_ID].as_primitive::(); assert_eq!(row_ids.values(), &[4, 3, 2, 1, 0]); } + + #[test] + fn precomputed_shuffle_buffer_uri_preserves_remote_uri() { + assert_eq!( + IvfIndexBuilder::::precomputed_shuffle_buffers_root_uri( + "s3://bucket/shuffle" + ), + "s3://bucket/shuffle" + ); + } } diff --git a/rust/lance/src/index/vector/encoded_dataset.rs b/rust/lance/src/index/vector/encoded_dataset.rs new file mode 100644 index 00000000000..866f903805c --- /dev/null +++ b/rust/lance/src/index/vector/encoded_dataset.rs @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_schema::Fields; +use futures::StreamExt; +use lance_core::utils::tokio::get_num_compute_intensive_cpus; +use lance_core::{Error, ROW_ID, Result}; +use lance_index::vector::v3::shuffler::ShuffleReader; +use lance_index::vector::{PART_ID_COLUMN, PQ_CODE_COLUMN}; +use lance_io::stream::{RecordBatchStream, RecordBatchStreamAdapter}; +use lance_table::format::Fragment; +use log::warn; +use serde::de::DeserializeOwned; + +use crate::Dataset; +use crate::dataset::builder::DatasetBuilder; + +pub(crate) const PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY: &str = + 
"lance:index_build:precomputed_encoded_partition_sizes"; +pub(crate) const PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY: &str = + "lance:index_build:precomputed_encoded_partition_fragment_ids"; +pub(crate) const PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY: &str = + "lance:index_build:precomputed_encoded_total_loss"; + +const PRECOMPUTED_ROW_ID_COLUMN: &str = "row_id"; + +pub(crate) struct EncodedDatasetShuffleReader { + dataset: Dataset, + row_id_column: String, + partition_sizes: Vec, + partition_fragments: Option>>, + total_loss: Option, +} + +impl EncodedDatasetShuffleReader { + pub(crate) async fn try_open( + uri: &str, + storage_options: Option<&HashMap>, + ) -> Result { + let mut builder = DatasetBuilder::from_uri(uri); + if let Some(storage_options) = storage_options { + builder = builder.with_storage_options(storage_options.clone()); + } + let dataset = builder.load().await?; + Self::try_new(dataset) + } + + pub(crate) fn try_new(dataset: Dataset) -> Result { + let row_id_column = if dataset.schema().field(ROW_ID).is_some() { + ROW_ID.to_string() + } else if dataset.schema().field(PRECOMPUTED_ROW_ID_COLUMN).is_some() { + PRECOMPUTED_ROW_ID_COLUMN.to_string() + } else { + return Err(Error::invalid_input(format!( + "precomputed encoded dataset must contain '{}' or '{}' column", + ROW_ID, PRECOMPUTED_ROW_ID_COLUMN + ))); + }; + + for required_column in [PART_ID_COLUMN, PQ_CODE_COLUMN] { + if dataset.schema().field(required_column).is_none() { + return Err(Error::invalid_input(format!( + "precomputed encoded dataset is missing required column '{}'", + required_column + ))); + } + } + + let metadata = dataset.metadata(); + let partition_sizes: Vec = + parse_required_metadata(metadata, PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY)?; + + let partition_fragments = parse_optional_metadata::>>( + metadata, + PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY, + )? 
+ .map(|partition_fragment_ids| resolve_partition_fragments(&dataset, partition_fragment_ids)) + .transpose()?; + + if let Some(partition_fragments) = partition_fragments.as_ref() { + if partition_fragments.len() != partition_sizes.len() { + return Err(Error::invalid_input(format!( + "metadata '{}' has {} partitions but '{}' has {}", + PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY, + partition_fragments.len(), + PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY, + partition_sizes.len(), + ))); + } + } + + let total_loss = + parse_optional_metadata::(metadata, PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY)?; + + Ok(Self { + dataset, + row_id_column, + partition_sizes, + partition_fragments, + total_loss, + }) + } + + fn rename_row_id( + stream: impl RecordBatchStream + Unpin + 'static, + row_id_idx: usize, + ) -> impl RecordBatchStream + Unpin + 'static { + let new_schema = Arc::new(arrow_schema::Schema::new( + stream + .schema() + .fields + .iter() + .enumerate() + .map(|(field_idx, field)| { + if field_idx == row_id_idx { + arrow_schema::Field::new( + ROW_ID, + field.data_type().clone(), + field.is_nullable(), + ) + } else { + field.as_ref().clone() + } + }) + .collect::(), + )); + RecordBatchStreamAdapter::new( + new_schema.clone(), + stream.map(move |batch| match batch { + Ok(batch) => { + arrow_array::RecordBatch::try_new(new_schema.clone(), batch.columns().to_vec()) + .map_err(Error::from) + } + Err(error) => Err(error), + }), + ) + } +} + +#[async_trait::async_trait] +impl ShuffleReader for EncodedDatasetShuffleReader { + async fn read_partition( + &self, + partition_id: usize, + ) -> Result>> { + if partition_id >= self.partition_sizes.len() { + return Ok(None); + } + if self.partition_sizes[partition_id] == 0 { + return Ok(None); + } + + let mut scanner = self.dataset.scan(); + scanner.batch_readahead(get_num_compute_intensive_cpus()); + scanner.project(&[self.row_id_column.as_str(), PART_ID_COLUMN, PQ_CODE_COLUMN])?; + + if let 
Some(partition_fragments) = self.partition_fragments.as_ref() { + let fragments = &partition_fragments[partition_id]; + if fragments.is_empty() { + warn!( + "precomputed encoded dataset metadata has no fragments for non-empty partition {}, falling back to filtered scan", + partition_id + ); + } else { + scanner.with_fragments(fragments.clone()); + } + } + + scanner.filter(&format!("{PART_ID_COLUMN} = {partition_id}"))?; + let stream = scanner.try_into_stream().await?; + if let Some((row_id_idx, _)) = stream.schema().column_with_name(PRECOMPUTED_ROW_ID_COLUMN) { + Ok(Some(Box::new(Self::rename_row_id(stream, row_id_idx)))) + } else { + Ok(Some(Box::new(stream))) + } + } + + fn partition_size(&self, partition_id: usize) -> Result { + Ok(self.partition_sizes.get(partition_id).copied().unwrap_or(0)) + } + + fn total_loss(&self) -> Option { + self.total_loss + } +} + +fn parse_required_metadata( + metadata: &HashMap, + key: &str, +) -> Result { + let value = metadata.get(key).ok_or_else(|| { + Error::invalid_input(format!( + "precomputed encoded dataset is missing required metadata '{}'", + key + )) + })?; + parse_metadata_value(value, key) +} + +fn parse_optional_metadata( + metadata: &HashMap, + key: &str, +) -> Result> { + metadata + .get(key) + .map(|value| parse_metadata_value(value, key)) + .transpose() +} + +fn parse_metadata_value(value: &str, key: &str) -> Result { + serde_json::from_str(value).map_err(|error| { + Error::invalid_input(format!( + "failed to parse precomputed encoded dataset metadata '{}' from '{}': {}", + key, value, error + )) + }) +} + +fn resolve_partition_fragments( + dataset: &Dataset, + partition_fragment_ids: Vec>, +) -> Result>> { + let fragments_by_id = dataset + .fragments() + .iter() + .cloned() + .map(|fragment| (fragment.id, fragment)) + .collect::>(); + + partition_fragment_ids + .into_iter() + .map(|fragment_ids| { + fragment_ids + .into_iter() + .map(|fragment_id| { + fragments_by_id.get(&fragment_id).cloned().ok_or_else(|| { + 
Error::invalid_input(format!( + "precomputed encoded dataset metadata references unknown fragment id {}", + fragment_id + )) + }) + }) + .collect() + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + use arrow_array::{ + FixedSizeListArray, RecordBatch, RecordBatchIterator, UInt8Array, UInt32Array, UInt64Array, + cast::AsArray, + }; + use futures::TryStreamExt; + use lance_arrow::FixedSizeListArrayExt; + + use crate::dataset::WriteParams; + + #[tokio::test] + async fn encoded_dataset_reader_reads_mapped_fragments_and_renames_row_id() { + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new("row_id", arrow_schema::DataType::UInt64, false), + arrow_schema::Field::new(PART_ID_COLUMN, arrow_schema::DataType::UInt32, false), + arrow_schema::Field::new( + PQ_CODE_COLUMN, + arrow_schema::DataType::FixedSizeList( + Arc::new(arrow_schema::Field::new( + "item", + arrow_schema::DataType::UInt8, + true, + )), + 2, + ), + true, + ), + ])); + + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![10_u64, 11])), + Arc::new(UInt32Array::from(vec![0_u32, 1])), + Arc::new( + FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![1, 2, 3, 4]), 2) + .unwrap(), + ), + ], + ) + .unwrap(); + let batch2 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![12_u64, 13])), + Arc::new(UInt32Array::from(vec![1_u32, 1])), + Arc::new( + FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![5, 6, 7, 8]), 2) + .unwrap(), + ), + ], + ) + .unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch1), Ok(batch2)], schema); + let write_params = WriteParams { + max_rows_per_file: 2, + max_rows_per_group: 2, + ..Default::default() + }; + let mut dataset = Dataset::write( + reader, + "memory://precomputed-encoded-reader", + Some(write_params), + ) + .await + .unwrap(); + + let fragment_ids = dataset + .get_fragments() + .into_iter() + .map(|fragment| 
fragment.metadata().id) + .collect::>(); + assert_eq!(fragment_ids.len(), 2); + + dataset + .update_metadata(vec![ + ( + PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY.to_string(), + serde_json::to_string(&vec![1_usize, 3]).unwrap(), + ), + ( + PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY.to_string(), + serde_json::to_string(&vec![ + vec![fragment_ids[0] as u64], + vec![fragment_ids[0] as u64, fragment_ids[1] as u64], + ]) + .unwrap(), + ), + ]) + .await + .unwrap(); + + let reader = EncodedDatasetShuffleReader::try_new(dataset).unwrap(); + assert_eq!(reader.partition_size(0).unwrap(), 1); + assert_eq!(reader.partition_size(1).unwrap(), 3); + + let stream = reader.read_partition(1).await.unwrap().unwrap(); + let batches = stream.try_collect::>().await.unwrap(); + let row_ids = batches + .iter() + .flat_map(|batch| { + batch[ROW_ID] + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect::>(); + assert_eq!(row_ids, vec![11, 12, 13]); + assert!( + batches + .iter() + .all(|batch| batch.column_by_name("row_id").is_none()) + ); + } +} diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index b67e6ea8e81..1f2b47887db 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -1204,6 +1204,12 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } + if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.centroids.is_none() { + return Err(Error::index( + "precomputed_encoded_dataset_uri requires centroids to be set".to_string(), + )); + } + if ivf.precomputed_shuffle_buffers.is_some() && ivf.precomputed_partitions_file.is_some() { return Err(Error::index( "precomputed_shuffle_buffers and precomputed_partitions_file are mutually exclusive" @@ -1211,6 +1217,20 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } + if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.precomputed_partitions_file.is_some() { + return Err(Error::index( + 
"precomputed_encoded_dataset_uri and precomputed_partitions_file are mutually exclusive" + .to_string(), + )); + } + + if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.precomputed_shuffle_buffers.is_some() { + return Err(Error::index( + "precomputed_encoded_dataset_uri and precomputed_shuffle_buffers are mutually exclusive" + .to_string(), + )); + } + Ok(()) } @@ -1222,6 +1242,12 @@ fn sanity_check_params(ivf: &IvfBuildParams, pq: &PQBuildParams) -> Result<()> { )); } + if ivf.precomputed_encoded_dataset_uri.is_some() && pq.codebook.is_none() { + return Err(Error::index( + "precomputed_encoded_dataset_uri requires codebooks to be set".to_string(), + )); + } + Ok(()) } @@ -1698,7 +1724,7 @@ async fn write_ivf_pq_file( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(Path, Vec)>, + precomputed_shuffle_buffers: Option<(String, Vec)>, ) -> Result<()> { let path = index_dir.child(uuid).child(INDEX_FILE_NAME); let mut writer = object_store.create(&path).await?; @@ -1791,7 +1817,7 @@ async fn write_ivf_hnsw_file( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(Path, Vec)>, + precomputed_shuffle_buffers: Option<(String, Vec)>, ) -> Result<()> { let object_store = dataset.object_store(); let path = dataset.indices_dir().child(uuid).child(INDEX_FILE_NAME); diff --git a/rust/lance/src/index/vector/ivf/builder.rs b/rust/lance/src/index/vector/ivf/builder.rs index 9bd1ba95803..bcd47ae4057 100644 --- a/rust/lance/src/index/vector/ivf/builder.rs +++ b/rust/lance/src/index/vector/ivf/builder.rs @@ -22,7 +22,6 @@ use lance_index::vector::{ivf::storage::IvfModel, transform::Transformer}; use lance_io::stream::RecordBatchStreamAdapter; use lance_table::io::manifest::ManifestDescribing; use log::info; -use object_store::path::Path; use tracing::instrument; use lance_core::{Error, ROW_ID, 
Result, traits::DatasetTakeRows}; @@ -55,7 +54,7 @@ pub(super) async fn build_partitions( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(Path, Vec)>, + precomputed_shuffle_buffers: Option<(String, Vec)>, ) -> Result<()> { let schema = data.schema(); if schema.column_with_name(column).is_none() { @@ -254,7 +253,7 @@ pub(super) async fn build_hnsw_partitions( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(Path, Vec)>, + precomputed_shuffle_buffers: Option<(String, Vec)>, ) -> Result<(Vec, IvfModel)> { let schema = data.schema(); if schema.column_with_name(column).is_none() { From e3f29f56484b1824b44dfe976f2e4db31573a632 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 3 Apr 2026 13:45:56 +0800 Subject: [PATCH 13/21] python: fix fragment scans in cuvs encoded dataset metadata --- python/python/lance/cuvs.py | 6 +---- python/python/tests/test_vector_index.py | 29 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index ecd3173c249..5940edbdea3 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -120,11 +120,7 @@ def _annotate_precomputed_encoded_dataset( partition_fragments = [[] for _ in range(len(partition_sizes))] for fragment in dataset.get_fragments(): fragment_partitions = set() - scanner = ( - dataset.scanner(columns=["__ivf_part_id"]) - .with_fragments([fragment]) - .to_scanner() - ) + scanner = fragment.scanner(columns=["__ivf_part_id"]) for batch in scanner.to_batches(): fragment_partitions.update( int(partition_id) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 11c4fdb7ad0..1d17423bd68 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -807,6 
+807,35 @@ def copy_to_host(self): assert array.dtype == np.float32 +def test_annotate_precomputed_encoded_dataset_scans_fragment_directly(tmp_path): + dataset_uri = tmp_path / "encoded_dataset" + + def make_table(partition_ids: list[int], row_id_start: int): + part_ids = np.asarray(partition_ids, dtype=np.uint32) + row_ids = pa.array( + np.arange(row_id_start, row_id_start + len(partition_ids), dtype=np.uint64) + ) + pq_values = pa.array(np.zeros(len(partition_ids) * 4, dtype=np.uint8)) + pq_codes = pa.FixedSizeListArray.from_arrays(pq_values, 4) + return pa.Table.from_arrays( + [row_ids, pa.array(part_ids), pq_codes], + names=["row_id", "__ivf_part_id", "__pq_code"], + ) + + ds = lance.write_dataset(make_table([0, 1, 1, 0], 0), dataset_uri) + ds = lance.write_dataset(make_table([2, 3, 2, 3], 4), dataset_uri, mode="append") + + lance_cuvs._annotate_precomputed_encoded_dataset(ds, [2, 2, 2, 2]) + + metadata = ds.metadata() + partition_fragments = json.loads( + metadata[ + lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY + ] + ) + assert partition_fragments == [[0], [0], [1], [1]] + + def test_one_pass_assign_ivf_pq_on_cuvs_writes_encoded_dataset(tmp_path, monkeypatch): tbl = create_table(nvec=32, ndim=16) dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") From 769a218c2fb8885f31cca1cbcd02a4e751ccd62e Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 7 Apr 2026 16:21:57 +0800 Subject: [PATCH 14/21] feat: add partition artifacts for cuvs builds --- Cargo.toml | 2 +- python/Cargo.lock | 112 +- python/Cargo.toml | 2 + python/python/lance/cuvs.py | 188 ++- python/python/lance/dataset.py | 11 +- python/python/lance/lance/__init__.pyi | 14 + python/python/tests/test_vector_index.py | 188 ++- python/src/dataset.rs | 4 + python/src/file.rs | 77 + python/src/indices.rs | 91 ++ python/src/lib.rs | 4 +- rust/lance-cuvs/Cargo.toml | 28 + rust/lance-cuvs/src/lib.rs | 1237 +++++++++++++++++ rust/lance-index/src/vector/ivf/builder.rs | 5 + 
rust/lance/src/index/vector.rs | 3 + rust/lance/src/index/vector/builder.rs | 27 + rust/lance/src/index/vector/ivf.rs | 37 + .../src/index/vector/partition_artifact.rs | 956 +++++++++++++ rust/lance/src/index/vector/utils.rs | 2 +- 19 files changed, 2913 insertions(+), 75 deletions(-) create mode 100644 rust/lance-cuvs/Cargo.toml create mode 100644 rust/lance-cuvs/src/lib.rs create mode 100644 rust/lance/src/index/vector/partition_artifact.rs diff --git a/Cargo.toml b/Cargo.toml index c922eff6b8b..bddb49ed4a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ "rust/compression/bitpacking", "rust/arrow-scalar", ] -exclude = ["python", "java/lance-jni"] +exclude = ["python", "java/lance-jni", "rust/lance-cuvs"] # Python package needs to be built by maturin. resolver = "3" diff --git a/python/Cargo.lock b/python/Cargo.lock index 4507a617872..aa4cfb72154 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -14,7 +14,7 @@ dependencies = [ "core_extensions", "crossbeam-channel", "generational-arena", - "libloading", + "libloading 0.7.4", "lock_api", "parking_lot", "paste", @@ -1070,6 +1070,26 @@ dependencies = [ "virtue", ] +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags 2.11.0", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.117", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1269,6 +1289,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = 
[ + "nom 7.1.3", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -1315,6 +1344,17 @@ dependencies = [ "inout", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading 0.8.9", +] + [[package]] name = "cmake" version = "0.1.58" @@ -1577,6 +1617,26 @@ dependencies = [ "memchr", ] +[[package]] +name = "cuvs" +version = "26.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9778fa1e16f42539772496e9adba2a29c67dca84bcb0d247795f9cb3135ba87d" +dependencies = [ + "cuvs-sys", + "ndarray 0.15.6", +] + +[[package]] +name = "cuvs-sys" +version = "26.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4cad121da7a7ac908965352ffeac029a93fb0e3a1278a271f7204098b8724e9" +dependencies = [ + "bindgen", + "cmake", +] + [[package]] name = "darling" version = "0.20.11" @@ -3985,6 +4045,30 @@ dependencies = [ "url", ] +[[package]] +name = "lance-cuvs" +version = "5.0.0-beta.2" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", + "cuvs", + "cuvs-sys", + "futures", + "half", + "lance", + "lance-arrow", + "lance-core", + "lance-file", + "lance-index", + "lance-io", + "lance-linalg", + "log", + "ndarray 0.16.1", + "tokio", +] + [[package]] name = "lance-datafusion" version = "5.0.0-beta.2" @@ -4167,7 +4251,7 @@ dependencies = [ "lindera", "lindera-tantivy", "log", - "ndarray", + "ndarray 0.16.1", "num-traits", "object_store", "prost", @@ -4468,6 +4552,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "liblzma" version = "0.4.6" @@ -4894,6 +4988,19 @@ 
version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -5689,6 +5796,7 @@ dependencies = [ "lance", "lance-arrow", "lance-core", + "lance-cuvs", "lance-datafusion", "lance-datagen", "lance-encoding", diff --git a/python/Cargo.toml b/python/Cargo.toml index a3542f7360f..d63a8e113d4 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -35,6 +35,7 @@ lance = { path = "../rust/lance", features = [ ] } lance-arrow = { path = "../rust/lance-arrow" } lance-core = { path = "../rust/lance-core" } +lance-cuvs = { path = "../rust/lance-cuvs", optional = true } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } @@ -75,6 +76,7 @@ bytes = "1.4" [features] default = [] +cuvs = ["dep:lance-cuvs"] datagen = ["lance-datagen"] fp16kernels = ["lance/fp16kernels"] diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index 5940edbdea3..b36fd18c564 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -12,6 +12,8 @@ import pyarrow as pa import pyarrow.compute as pc +from .file import LanceFileSession +from .lance import PartitionArtifactBuilder from .dependencies import numpy as np from .log import LOGGER from .util import _normalize_metric_type @@ -29,6 +31,36 @@ "lance:index_build:precomputed_encoded_total_loss" ) +PARTITION_ARTIFACT_MANIFEST_VERSION = 1 +PARTITION_ARTIFACT_MANIFEST_FILE_NAME = "manifest.json" +PARTITION_ARTIFACT_METADATA_FILE_NAME = "metadata.lance" 
+PARTITION_ARTIFACT_PARTITIONS_DIR = "partitions" +DEFAULT_PARTITION_ARTIFACT_BUCKETS = 256 +PARTITION_ARTIFACT_ROW_ID_COLUMN = "_rowid" + +try: + from . import lance as _lance_ext + + _assign_ivf_pq_on_cuvs_rust_impl = getattr( + _lance_ext.indices, "_assign_ivf_pq_on_cuvs_rust" + ) + _train_ivf_pq_on_cuvs_rust_impl = getattr( + _lance_ext.indices, "_train_ivf_pq_on_cuvs_rust" + ) +except (ImportError, AttributeError): + _assign_ivf_pq_on_cuvs_rust_impl = None + _train_ivf_pq_on_cuvs_rust_impl = None + + +def _has_rust_cuvs_backend() -> bool: + return ( + _train_ivf_pq_on_cuvs_rust_impl is not None + and _assign_ivf_pq_on_cuvs_rust_impl is not None + ) + +def _unwrap_dataset(dataset): + return getattr(dataset, "_ds", dataset) + def is_cuvs_accelerator(accelerator: object) -> bool: return accelerator == "cuvs" @@ -170,6 +202,97 @@ def _as_numpy(array_like) -> np.ndarray: raise TypeError("Unable to convert cuVS output to numpy") +def _normalize_artifact_root(path_or_uri: str | Path) -> str: + root = str(path_or_uri) + if re.search(r".:\\", root) is not None: + root = root.replace("\\", "/", 1) + return root + + +def _make_metadata_table( + ivf_centroids: np.ndarray, + pq_codebook: np.ndarray, +) -> pa.Table: + dimension = ivf_centroids.shape[1] + subvector_dim = pq_codebook.shape[2] + ivf_type = pa.list_(pa.list_(pa.float32(), dimension)) + pq_type = pa.list_(pa.list_(pa.float32(), subvector_dim)) + ivf_values = pa.array([ivf_centroids.tolist()], type=ivf_type) + pq_values = pa.array( + [pq_codebook.reshape(-1, subvector_dim).tolist()], + type=pq_type, + ) + return pa.Table.from_arrays( + [ivf_values, pq_values], + names=["_ivf_centroids", "_pq_codebook"], + ) + + +def _write_partition_artifact_metadata( + session: LanceFileSession, + *, + ivf_centroids: np.ndarray, + pq_codebook: np.ndarray, + metric_type: str, + num_bits: int, +) -> None: + metadata_table = _make_metadata_table(ivf_centroids, pq_codebook) + with session.open_writer( + 
PARTITION_ARTIFACT_METADATA_FILE_NAME, + schema=metadata_table.schema, + version="2.2", + ) as writer: + writer.add_schema_metadata("lance:index_build:artifact_version", "1") + writer.add_schema_metadata( + "lance:index_build:distance_type", _normalize_metric_type(metric_type) + ) + writer.add_schema_metadata( + "lance:index_build:num_partitions", str(ivf_centroids.shape[0]) + ) + writer.add_schema_metadata( + "lance:index_build:num_sub_vectors", str(pq_codebook.shape[0]) + ) + writer.add_schema_metadata("lance:index_build:num_bits", str(num_bits)) + writer.add_schema_metadata("lance:index_build:dimension", str(ivf_centroids.shape[1])) + writer.write_batch(metadata_table) + + +def _write_partition_artifact( + batches: Iterator[pa.RecordBatch], + *, + artifact_root: str | Path, + ivf_centroids: np.ndarray, + pq_codebook: np.ndarray, + metric_type: str, + num_bits: int, + num_partitions: int, + total_loss: float | None = None, +) -> tuple[str, list[str]]: + artifact_root = _normalize_artifact_root(artifact_root) + session = LanceFileSession(artifact_root) + builder = PartitionArtifactBuilder( + artifact_root, + num_partitions=num_partitions, + pq_code_width=pq_codebook.shape[0], + ) + for batch in batches: + builder.append_batch(batch) + + _write_partition_artifact_metadata( + session, + ivf_centroids=ivf_centroids, + pq_codebook=pq_codebook, + metric_type=metric_type, + num_bits=num_bits, + ) + artifact_files = builder.finish( + PARTITION_ARTIFACT_METADATA_FILE_NAME, + float(total_loss) if total_loss is not None else None, + ) + artifact_files.insert(1, PARTITION_ARTIFACT_METADATA_FILE_NAME) + return artifact_root, artifact_files + + def _to_cuvs_transform_input(matrix: np.ndarray): cupy = _optional_cupy() if cupy is None: @@ -251,6 +374,19 @@ def _train_ivf_pq_index_on_cuvs( num_bits: int = 8, filter_nan: bool = True, ): + if _has_rust_cuvs_backend(): + return _train_ivf_pq_on_cuvs_rust_impl( + _unwrap_dataset(dataset), + column, + num_partitions, + metric_type, + 
num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, + ) + if accelerator != "cuvs": raise ValueError("cuVS acceleration only supports accelerator='cuvs'") if num_bits != 8: @@ -310,7 +446,26 @@ def one_pass_assign_ivf_pq_on_cuvs( *, filter_nan: bool = True, ): - from . import write_dataset + if _has_rust_cuvs_backend(): + if accelerator != "cuvs": + raise ValueError("cuVS acceleration only supports accelerator='cuvs'") + if trained_index is None: + raise ValueError( + "one_pass_assign_ivf_pq_on_cuvs requires a trained cuVS index for " + "single-node transform" + ) + if dst_dataset_uri is None: + dst_dataset_uri = tempfile.mkdtemp() + artifact_files = _assign_ivf_pq_on_cuvs_rust_impl( + _unwrap_dataset(dataset), + column, + trained_index, + str(dst_dataset_uri), + batch_size=batch_size, + filter_nan=filter_nan, + ) + LOGGER.info("Saved precomputed partition artifact to %s", dst_dataset_uri) + return str(dst_dataset_uri), artifact_files if accelerator != "cuvs": raise ValueError("cuVS acceleration only supports accelerator='cuvs'") @@ -346,7 +501,7 @@ def one_pass_assign_ivf_pq_on_cuvs( output_schema = pa.schema( [ - pa.field("row_id", pa.uint64()), + pa.field(PARTITION_ARTIFACT_ROW_ID_COLUMN, pa.uint64()), pa.field("__ivf_part_id", pa.uint32()), pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), ] @@ -399,28 +554,19 @@ def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: if dst_dataset_uri is None: dst_dataset_uri = tempfile.mkdtemp() - if re.search(r".:\\", dst_dataset_uri) is not None: - dst_dataset_uri = dst_dataset_uri.replace("\\", "/", 1) - - reader = pa.RecordBatchReader.from_batches( - output_schema, _partition_and_pq_codes_assignment() - ) - ds = write_dataset( - reader, - dst_dataset_uri, - schema=output_schema, - data_storage_version="2.2", - ) - _annotate_precomputed_encoded_dataset( - ds, partition_sizes.astype(int).tolist() + artifact_root, artifact_files 
= _write_partition_artifact( + _partition_and_pq_codes_assignment(), + artifact_root=dst_dataset_uri, + ivf_centroids=ivf_centroids, + pq_codebook=pq_codebook, + metric_type=metric_type, + num_bits=8, + num_partitions=num_partitions, ) - shuffle_buffers = [ - data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() - ] progress.close() - LOGGER.info("Saved precomputed pq_codes to %s", dst_dataset_uri) - return str(dst_dataset_uri), shuffle_buffers + LOGGER.info("Saved precomputed partition artifact to %s", artifact_root) + return str(artifact_root), artifact_files def train_ivf_pq_on_cuvs( diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index be5f7cd2c6f..415ffdb8865 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2985,7 +2985,7 @@ def _create_index_impl( ) LOGGER.info("cuVS ivf+pq training time: %ss", ivfpq_train_time) timers["ivf+pq_assign:start"] = time.time() - shuffle_output_dir, _ = one_pass_assign_ivf_pq_on_cuvs( + artifact_root, _ = one_pass_assign_ivf_pq_on_cuvs( self, column[0], metric, @@ -2996,12 +2996,12 @@ def _create_index_impl( batch_size=1024 * 128, filter_nan=filter_nan, ) + kwargs["precomputed_partition_artifact_uri"] = artifact_root timers["ivf+pq_assign:end"] = time.time() ivfpq_assign_time = ( timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] ) LOGGER.info("cuVS ivf+pq transform time: %ss", ivfpq_assign_time) - kwargs["precomputed_encoded_dataset_uri"] = shuffle_output_dir else: from .vector import ( one_pass_assign_ivf_pq_on_accelerator, @@ -3216,6 +3216,13 @@ def _create_index_impl( "Temporary precomputed encoded dataset stored at %s, you may want to delete it.", kwargs["precomputed_encoded_dataset_uri"], ) + if "precomputed_partition_artifact_uri" in kwargs.keys() and os.path.exists( + kwargs["precomputed_partition_artifact_uri"] + ): + LOGGER.info( + "Temporary precomputed partition artifact stored at %s, you may want to delete it.", + 
kwargs["precomputed_partition_artifact_uri"], + ) return index def create_index( diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index f0be29f39ca..d377f381246 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -135,6 +135,20 @@ class LanceFileSession: def upload_file(self, local_path: str, remote_path: str) -> None: ... def download_file(self, remote_path: str, local_path: str) -> None: ... +class PartitionArtifactBuilder: + def __init__( + self, + uri_or_path: str, + num_partitions: int, + pq_code_width: int, + storage_options: Optional[Dict[str, str]] = None, + storage_options_provider: Optional[StorageOptionsProvider] = None, + ): ... + def append_batch(self, batch: pa.RecordBatch) -> None: ... + def finish( + self, metadata_file: str, total_loss: Optional[float] = None + ) -> List[str]: ... + class LanceFileReader: def __init__( self, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 1d17423bd68..c2b42de2ac1 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -22,12 +22,18 @@ import pytest from lance import LanceDataset, LanceFragment from lance.dataset import VectorIndexReader +from lance.file import LanceFileReader from lance.indices import IndexFileVersion, IndicesBuilder from lance.query import MatchQuery, PhraseQuery from lance.util import validate_vector_index # noqa: E402 from lance.vector import vec_to_table # noqa: E402 +def _disable_rust_cuvs_backend(monkeypatch): + monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", None) + monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", None) + + def create_table(nvec=1000, ndim=128, nans=0, nullify=False, dtype=np.float32): mat = np.random.randn(nvec, ndim) if nans > 0: @@ -585,8 +591,8 @@ def fake_assign( accelerator, ivf_centroids, pq_codebook, - trained_index=None, - 
dst_dataset_uri=None, + trained_index, + dst_path=None, batch_size=20480, *, filter_nan, @@ -595,36 +601,23 @@ def fake_assign( calls["assign_column"] = column calls["assign_metric_type"] = metric_type calls["assign_accelerator"] = accelerator + calls["assign_ivf_centroids"] = ivf_centroids + calls["assign_pq_codebook"] = pq_codebook calls["assign_trained_index"] = trained_index calls["assign_batch_size"] = batch_size calls["assign_filter_nan"] = filter_nan - - row_ids = dataset_arg.to_table(columns=[], with_row_id=True)[ - "_rowid" - ].to_numpy() - part_ids = pa.array(np.zeros(len(row_ids), dtype=np.uint32)) - pq_values = pa.array(np.zeros(len(row_ids) * 16, dtype=np.uint8)) - pq_codes = pa.FixedSizeListArray.from_arrays(pq_values, 16) - shuffle_ds_uri = str(tmp_path / "cuvs_shuffle_buffers") - shuffle_ds = lance.write_dataset( - pa.Table.from_arrays( - [pa.array(row_ids), part_ids, pq_codes], - names=["row_id", "__ivf_part_id", "__pq_code"], - ), - shuffle_ds_uri, - ) - lance_cuvs._annotate_precomputed_encoded_dataset( - shuffle_ds, [len(row_ids), 0, 0, 0] - ) - shuffle_buffers = [ - data_file.path - for frag in shuffle_ds.get_fragments() - for data_file in frag.data_files() + return str(tmp_path / "cuvs_artifact"), [ + "manifest.json", + "metadata.lance", + "partitions/bucket-00000.lance", ] - return shuffle_ds_uri, shuffle_buffers monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_index_on_cuvs", fake_train) - monkeypatch.setattr(lance_cuvs, "one_pass_assign_ivf_pq_on_cuvs", fake_assign) + monkeypatch.setattr( + lance_cuvs, + "one_pass_assign_ivf_pq_on_cuvs", + fake_assign, + ) dataset = dataset.create_index( "vector", @@ -712,6 +705,7 @@ def fake_prepare( def test_train_ivf_pq_on_cuvs_nullable_vectors(tmp_path, monkeypatch): + _disable_rust_cuvs_backend(monkeypatch) tbl = create_table(nvec=32, ndim=16, nullify=True) dataset = lance.write_dataset(tbl, tmp_path) @@ -747,9 +741,59 @@ def build(build_params, matrix): assert pq_codebook.shape == (4, 256, 4) +def 
test_train_ivf_pq_on_cuvs_prefers_rust_backend(tmp_path, monkeypatch): + dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) + calls = {} + + class FakeRustIndex: + pass + + def fake_train(*args, **kwargs): + calls["args"] = args + calls["kwargs"] = kwargs + return ( + FakeRustIndex(), + pa.FixedSizeListArray.from_arrays( + pa.array(np.arange(64, dtype=np.float32)), 16 + ), + pa.FixedSizeListArray.from_arrays( + pa.array(np.arange(4 * 256 * 4, dtype=np.float32)), 4 + ), + ) + + monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", fake_train) + monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", object()) + monkeypatch.setattr( + lance_cuvs, + "_require_cuvs", + lambda: (_ for _ in ()).throw(AssertionError("python cuVS backend should not run")), + ) + + trained_index, centroids, pq_codebook = lance_cuvs._train_ivf_pq_index_on_cuvs( + dataset, + "vector", + 4, + "l2", + "cuvs", + 4, + sample_rate=8, + max_iters=30, + num_bits=8, + filter_nan=True, + ) + + assert isinstance(trained_index, FakeRustIndex) + assert calls["args"][:5] == (dataset, "vector", 4, "l2", 4) + assert calls["kwargs"]["sample_rate"] == 8 + assert calls["kwargs"]["max_iters"] == 30 + assert isinstance(centroids, pa.FixedSizeListArray) + assert isinstance(pq_codebook, pa.FixedSizeListArray) + + def test_train_ivf_pq_on_cuvs_uses_num_sub_vectors_for_pq_dim( tmp_path, monkeypatch ): + _disable_rust_cuvs_backend(monkeypatch) dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) calls = {} @@ -836,7 +880,8 @@ def make_table(partition_ids: list[int], row_id_start: int): assert partition_fragments == [[0], [0], [1], [1]] -def test_one_pass_assign_ivf_pq_on_cuvs_writes_encoded_dataset(tmp_path, monkeypatch): +def test_one_pass_assign_ivf_pq_on_cuvs_writes_partition_artifact(tmp_path, monkeypatch): + _disable_rust_cuvs_backend(monkeypatch) tbl = create_table(nvec=32, ndim=16) dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") 
@@ -875,7 +920,7 @@ def transform(index, vectors): monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) monkeypatch.setattr(lance_cuvs, "_optional_cupy", lambda: FakeCupyModule()) - shuffle_uri, shuffle_buffers = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( + artifact_root, artifact_files = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( dataset, "vector", "l2", @@ -886,33 +931,82 @@ def transform(index, vectors): batch_size=8, ) - shuffle_ds = lance.dataset(shuffle_uri) - data_batch = next(shuffle_ds.to_batches(batch_size=1024)) + manifest_path = Path(artifact_root) / lance_cuvs.PARTITION_ARTIFACT_MANIFEST_FILE_NAME + metadata_path = Path(artifact_root) / lance_cuvs.PARTITION_ARTIFACT_METADATA_FILE_NAME - assert len(shuffle_buffers) > 0 - assert all(path.endswith(".lance") for path in shuffle_buffers) - assert data_batch.column("row_id").type == pa.uint64() - assert data_batch.column("__ivf_part_id").type == pa.uint32() - assert data_batch.column("__pq_code").type == pa.list_(pa.uint8(), 4) - metadata = shuffle_ds.metadata() - assert json.loads( - metadata[ - lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY - ] - ) == [8, 8, 8, 8] - partition_fragments = json.loads( - metadata[ - lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY - ] + assert manifest_path.exists() + assert metadata_path.exists() + assert any(path.endswith(".lance") for path in artifact_files) + + manifest = json.loads(manifest_path.read_text()) + assert manifest["version"] == lance_cuvs.PARTITION_ARTIFACT_MANIFEST_VERSION + assert manifest["num_partitions"] == 4 + assert manifest["metadata_file"] == lance_cuvs.PARTITION_ARTIFACT_METADATA_FILE_NAME + assert [entry["num_rows"] for entry in manifest["partitions"]] == [8, 8, 8, 8] + assert all(entry["path"] for entry in manifest["partitions"]) + assert all(entry["ranges"] for entry in manifest["partitions"]) + + metadata_reader = LanceFileReader(str(metadata_path)) + metadata_table = 
metadata_reader.read_all().to_table() + assert metadata_table.column("_ivf_centroids").type == pa.list_(pa.list_(pa.float32(), 16)) + assert metadata_table.column("_pq_codebook").type == pa.list_(pa.list_(pa.float32(), 4)) + + bucket_path = Path(artifact_root) / manifest["partitions"][0]["path"] + bucket_reader = LanceFileReader(str(bucket_path)) + bucket_table = bucket_reader.read_all().to_table() + assert bucket_table.column("_rowid").type == pa.uint64() + assert bucket_table.column("__pq_code").type == pa.list_(pa.uint8(), 4) + + +def test_one_pass_assign_ivf_pq_on_cuvs_prefers_rust_backend(tmp_path, monkeypatch): + dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path / "cuvs_assign_rust") + calls = {} + + class FakeRustIndex: + pass + + def fake_assign(*args, **kwargs): + calls["args"] = args + calls["kwargs"] = kwargs + return ["manifest.json", "metadata.lance", "partitions/bucket-00000.lance"] + + monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", object()) + monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", fake_assign) + monkeypatch.setattr( + lance_cuvs, + "_require_cuvs", + lambda: (_ for _ in ()).throw(AssertionError("python cuVS backend should not run")), + ) + + artifact_root, artifact_files = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( + dataset, + "vector", + "l2", + "cuvs", + np.random.randn(4, 16).astype(np.float32), + np.random.randn(4, 256, 4).astype(np.float32), + trained_index=FakeRustIndex(), + dst_dataset_uri=tmp_path / "artifact", + batch_size=4096, + ) + + assert artifact_root == str(tmp_path / "artifact") + assert artifact_files[0] == "manifest.json" + assert calls["args"][:4] == ( + dataset, + "vector", + calls["args"][2], + str(tmp_path / "artifact"), ) - assert len(partition_fragments) == 4 - assert all(partition_fragments) + assert isinstance(calls["args"][2], FakeRustIndex) + assert calls["kwargs"]["batch_size"] == 4096 def 
test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( tmp_path, monkeypatch, ): + _disable_rust_cuvs_backend(monkeypatch) tbl = create_table(nvec=32, ndim=128) dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_incompatible") diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 62e852c117e..19c3e4ec5d4 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -3375,6 +3375,10 @@ fn prepare_vector_index_params( ivf_params.precomputed_encoded_dataset_uri = Some(uri.to_string()); }; + if let Some(uri) = kwargs.get_item("precomputed_partition_artifact_uri")? { + ivf_params.precomputed_partition_artifact_uri = Some(uri.to_string()); + }; + if let Some(storage_options) = storage_options { ivf_params.storage_options = Some(storage_options); } diff --git a/python/src/file.rs b/python/src/file.rs index da8ba3e76bb..eb830dc4a73 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -18,6 +18,7 @@ use arrow_array::{RecordBatch, RecordBatchReader, UInt32Array}; use arrow_schema::Schema as ArrowSchema; use bytes::Bytes; use futures::stream::StreamExt; +use lance::index::vector::PartitionArtifactBuilder as CorePartitionArtifactBuilder; use lance::io::{ObjectStore, RecordBatchStream}; use lance_core::cache::LanceCache; use lance_core::utils::path::LancePathExt; @@ -370,6 +371,82 @@ impl Drop for LanceFileWriter { } } +#[pyclass] +pub struct PartitionArtifactBuilder { + inner: Arc>, +} + +impl PartitionArtifactBuilder { + #[allow(clippy::too_many_arguments)] + async fn open( + uri_or_path: String, + num_partitions: usize, + pq_code_width: usize, + storage_options: Option>, + storage_options_provider: Option>, + ) -> PyResult { + let (object_store, path) = object_store_from_uri_or_path_with_provider( + uri_or_path, + storage_options, + storage_options_provider, + ) + .await?; + let inner = CorePartitionArtifactBuilder::try_new_with_store( + object_store, + path, + num_partitions, + pq_code_width, + ) + .infer_error()?; + Ok(Self 
{ + inner: Arc::new(Mutex::new(inner)), + }) + } +} + +#[pymethods] +impl PartitionArtifactBuilder { + #[new] + #[pyo3(signature=(uri_or_path, num_partitions, pq_code_width, storage_options=None, storage_options_provider=None))] + #[allow(clippy::too_many_arguments)] + pub fn new( + uri_or_path: String, + num_partitions: usize, + pq_code_width: usize, + storage_options: Option>, + storage_options_provider: Option<&Bound<'_, PyAny>>, + ) -> PyResult { + let provider = storage_options_provider + .map(crate::storage_options::py_object_to_storage_options_provider) + .transpose()?; + rt().block_on( + None, + Self::open( + uri_or_path, + num_partitions, + pq_code_width, + storage_options, + provider, + ), + )? + } + + pub fn append_batch(&self, batch: PyArrowType) -> PyResult<()> { + rt().runtime.block_on(async { + self.inner.lock().await.append_batch(&batch.0).await + }) + .infer_error() + } + + #[pyo3(signature=(metadata_file, total_loss=None))] + pub fn finish(&self, metadata_file: String, total_loss: Option) -> PyResult> { + rt().runtime.block_on(async { + self.inner.lock().await.finish(&metadata_file, total_loss).await + }) + .infer_error() + } +} + pub async fn object_store_from_uri_or_path_no_options( uri_or_path: impl AsRef, ) -> PyResult<(Arc, Path)> { diff --git a/python/src/indices.rs b/python/src/indices.rs index cea7f2a968a..cb8288b51a5 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -32,6 +32,8 @@ use pyo3::{ }; use lance::index::DatasetIndexInternalExt; +#[cfg(feature = "cuvs")] +use lance_cuvs::TrainedIvfPqIndex; use crate::fragment::FileFragment; use crate::utils::{PyJson, PyLance}; @@ -155,6 +157,89 @@ impl PyIvfModel { } } +#[cfg(feature = "cuvs")] +#[pyclass(name = "_CuvsIvfPqIndex", module = "lance.indices", unsendable)] +pub struct PyCuvsIvfPqIndex { + inner: TrainedIvfPqIndex, +} + +#[cfg(feature = "cuvs")] +#[pyfunction] +#[allow(clippy::too_many_arguments)] +#[pyo3( + signature=( + dataset, + column, + num_partitions, + 
distance_type, + num_sub_vectors, + sample_rate=256, + max_iters=50, + num_bits=8, + filter_nan=true + ) +)] +fn _train_ivf_pq_on_cuvs_rust<'py>( + py: Python<'py>, + dataset: &Dataset, + column: &str, + num_partitions: u32, + distance_type: &str, + num_sub_vectors: u32, + sample_rate: u32, + max_iters: u32, + num_bits: u8, + filter_nan: bool, +) -> PyResult<(Py, Bound<'py, PyAny>, Bound<'py, PyAny>)> { + let distance_type = DistanceType::try_from(distance_type).unwrap(); + let trained = rt() + .runtime + .block_on(lance_cuvs::train_ivf_pq( + dataset.ds.as_ref(), + column, + num_partitions as usize, + distance_type, + num_sub_vectors as usize, + sample_rate as usize, + max_iters as usize, + num_bits as usize, + filter_nan, + )) + .infer_error()?; + let ivf_centroids = trained.ivf_centroids().clone().into_data().to_pyarrow(py)?; + let pq_codebook = trained.pq_codebook().clone().into_data().to_pyarrow(py)?; + Ok(( + Py::new(py, PyCuvsIvfPqIndex { inner: trained })?, + ivf_centroids, + pq_codebook, + )) +} + +#[cfg(feature = "cuvs")] +#[pyfunction] +#[pyo3(signature=(dataset, column, trained_index, artifact_root, batch_size=1024 * 128, filter_nan=true))] +fn _assign_ivf_pq_on_cuvs_rust( + py: Python<'_>, + dataset: &Dataset, + column: &str, + trained_index: &PyCuvsIvfPqIndex, + artifact_root: &str, + batch_size: usize, + filter_nan: bool, +) -> PyResult> { + let _ = py; + rt().runtime + .block_on(lance_cuvs::assign_ivf_pq_to_artifact( + dataset.ds.as_ref(), + column, + &trained_index.inner, + artifact_root, + batch_size, + filter_nan, + )) + .infer_error() +} + /// Internal helper to fetch an IVF model for the given index name. 
async fn do_get_ivf_model(dataset: &Dataset, index_name: &str) -> PyResult { use lance_index::metrics::NoOpMetricsCollector; @@ -716,6 +801,12 @@ pub fn register_indices(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { indices.add_class::()?; indices.add_class::()?; indices.add_wrapped(wrap_pyfunction!(get_ivf_model))?; + #[cfg(feature = "cuvs")] + { + indices.add_class::()?; + indices.add_wrapped(wrap_pyfunction!(_train_ivf_pq_on_cuvs_rust))?; + indices.add_wrapped(wrap_pyfunction!(_assign_ivf_pq_on_cuvs_rust))?; + } m.add_submodule(&indices)?; Ok(()) } diff --git a/python/src/lib.rs b/python/src/lib.rs index 9730f2ba1c5..819e3fddc3e 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -51,7 +51,8 @@ use dataset::{ use env_logger::{Builder, Env}; use file::{ LanceBufferDescriptor, LanceColumnMetadata, LanceFileMetadata, LanceFileReader, - LanceFileStatistics, LanceFileWriter, LancePageMetadata, stable_version, + LanceFileStatistics, LanceFileWriter, LancePageMetadata, PartitionArtifactBuilder, + stable_version, }; use log::Level; use pyo3::exceptions::PyIOError; @@ -258,6 +259,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/rust/lance-cuvs/Cargo.toml b/rust/lance-cuvs/Cargo.toml new file mode 100644 index 00000000000..a001f82c16f --- /dev/null +++ b/rust/lance-cuvs/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "lance-cuvs" +version = "5.0.0-beta.2" +edition = "2024" +authors = ["Lance Devs "] +license = "Apache-2.0" +rust-version = "1.91" +publish = false + +[dependencies] +arrow = "57.0.0" +arrow-array = "57.0.0" +arrow-buffer = "57.0.0" +arrow-schema = "57.0.0" +cuvs = "26.2.0" +cuvs-sys = "26.2.0" +futures = "0.3" +half = { version = "2.5", default-features = false, features = ["num-traits", "std"] } +lance = { path = "../lance" } +lance-arrow = { path = "../lance-arrow" } +lance-core = { 
path = "../lance-core" } +lance-file = { path = "../lance-file" } +lance-index = { path = "../lance-index" } +lance-io = { path = "../lance-io" } +lance-linalg = { path = "../lance-linalg" } +log = "0.4" +ndarray = { version = "0.16.1", features = ["matrixmultiply-threading"] } +tokio = { version = "1.48", features = ["rt-multi-thread"] } diff --git a/rust/lance-cuvs/src/lib.rs b/rust/lance-cuvs/src/lib.rs new file mode 100644 index 00000000000..db54ce47f22 --- /dev/null +++ b/rust/lance-cuvs/src/lib.rs @@ -0,0 +1,1237 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::ffi::{CStr, c_void}; +use std::marker::PhantomData; +use std::ptr; +use std::sync::Arc; +use arrow::compute::filter; +use arrow_array::cast::AsArray; +use arrow_array::types::{Float16Type, Float32Type, Float64Type, UInt8Type}; +use arrow_array::{ + Array, FixedSizeListArray, Float32Array, ListArray, RecordBatch, UInt8Array, UInt32Array, + UInt64Array, +}; +use arrow_buffer::{OffsetBuffer, ScalarBuffer}; +use arrow_schema::{DataType, Field, Schema as ArrowSchema}; +use cuvs::Resources; +use futures::TryStreamExt; +use lance::dataset::Dataset; +use lance::index::vector::PartitionArtifactBuilder; +use lance::index::vector::utils::{infer_vector_dim, vector_column_to_fsl}; +use lance_arrow::{FixedSizeListArrayExt, RecordBatchExt}; +use lance_core::{Error, ROW_ID, Result}; +use lance_file::version::LanceFileVersion; +use lance_file::writer::{FileWriter, FileWriterOptions}; +use lance_index::vector::utils::is_finite; +use lance_index::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN, PQ_CODE_COLUMN}; +use lance_linalg::distance::DistanceType; +use log::warn; +use ndarray::{Array2, ArrayView2}; + +const PARTITION_ARTIFACT_METADATA_FILE_NAME: &str = "metadata.lance"; +const PARTITION_ARTIFACT_FILE_VERSION: &str = "2.2"; +const PIPELINE_SLOTS: usize = 2; + +type CudaEventHandle = *mut c_void; + +#[link(name = "cudart")] +unsafe extern "C" { + fn 
cudaMallocHost(ptr: *mut *mut c_void, size: usize) -> cuvs_sys::cudaError_t; + fn cudaFreeHost(ptr: *mut c_void) -> cuvs_sys::cudaError_t; + fn cudaEventCreate(event: *mut CudaEventHandle) -> cuvs_sys::cudaError_t; + fn cudaEventDestroy(event: CudaEventHandle) -> cuvs_sys::cudaError_t; + fn cudaEventRecord( + event: CudaEventHandle, + stream: cuvs_sys::cudaStream_t, + ) -> cuvs_sys::cudaError_t; + fn cudaEventSynchronize(event: CudaEventHandle) -> cuvs_sys::cudaError_t; +} + +pub struct TrainedIvfPqIndex { + resources: Resources, + index: CuvsIvfPqIndex, + num_partitions: usize, + dimension: usize, + num_sub_vectors: usize, + num_bits: usize, + metric_type: DistanceType, + ivf_centroids: FixedSizeListArray, + pq_codebook: FixedSizeListArray, +} + +impl TrainedIvfPqIndex { + pub fn ivf_centroids(&self) -> &FixedSizeListArray { + &self.ivf_centroids + } + + pub fn pq_codebook(&self) -> &FixedSizeListArray { + &self.pq_codebook + } + + pub fn num_partitions(&self) -> usize { + self.num_partitions + } + + pub fn pq_code_width(&self) -> usize { + self.num_sub_vectors + } + + pub fn metric_type(&self) -> DistanceType { + self.metric_type + } + + pub fn num_bits(&self) -> usize { + self.num_bits + } +} + +struct CuvsIvfPqIndex { + raw: cuvs_sys::cuvsIvfPqIndex_t, +} + +impl CuvsIvfPqIndex { + fn try_new() -> Result { + let mut raw = ptr::null_mut(); + check_cuvs( + unsafe { cuvs_sys::cuvsIvfPqIndexCreate(&mut raw) }, + "create IVF_PQ index", + )?; + Ok(Self { raw }) + } +} + +impl Drop for CuvsIvfPqIndex { + fn drop(&mut self) { + if !self.raw.is_null() { + let _ = unsafe { cuvs_sys::cuvsIvfPqIndexDestroy(self.raw) }; + } + } +} + +enum MatrixBuffer<'a> { + Borrowed { + values: &'a [f32], + rows: usize, + cols: usize, + }, + Owned(Array2), +} + +impl MatrixBuffer<'_> { + fn view(&self) -> Result> { + match self { + Self::Borrowed { values, rows, cols } => ArrayView2::from_shape((*rows, *cols), values) + .map_err(|error| { + Error::io(format!("failed to create borrowed 
matrix view: {error}")) + }), + Self::Owned(array) => Ok(array.view()), + } + } + + fn rows(&self) -> usize { + match self { + Self::Borrowed { rows, .. } => *rows, + Self::Owned(array) => array.nrows(), + } + } +} + +struct HostTensorView { + shape: Vec, + tensor: cuvs_sys::DLManagedTensor, +} + +impl HostTensorView { + fn try_new(shape: &[usize], data: *mut std::ffi::c_void) -> Self { + let shape = shape.iter().map(|dim| *dim as i64).collect::>(); + let tensor = cuvs_sys::DLManagedTensor { + dl_tensor: cuvs_sys::DLTensor { + data, + device: cuvs_sys::DLDevice { + device_type: cuvs_sys::DLDeviceType::kDLCPU, + device_id: 0, + }, + ndim: shape.len() as i32, + dtype: T::dl_dtype(), + shape: shape.as_ptr() as *mut i64, + strides: ptr::null_mut(), + byte_offset: 0, + }, + manager_ctx: ptr::null_mut(), + deleter: None, + }; + Self { shape, tensor } + } + + fn as_mut_ptr(&mut self) -> *mut cuvs_sys::DLManagedTensor { + debug_assert_eq!(self.shape.len(), self.tensor.dl_tensor.ndim as usize); + &mut self.tensor + } +} + +trait DlElement: Copy + Default { + fn dl_dtype() -> cuvs_sys::DLDataType; +} + +impl DlElement for f32 { + fn dl_dtype() -> cuvs_sys::DLDataType { + cuvs_sys::DLDataType { + code: cuvs_sys::DLDataTypeCode::kDLFloat as u8, + bits: 32, + lanes: 1, + } + } +} + +impl DlElement for u8 { + fn dl_dtype() -> cuvs_sys::DLDataType { + cuvs_sys::DLDataType { + code: cuvs_sys::DLDataTypeCode::kDLUInt as u8, + bits: 8, + lanes: 1, + } + } +} + +impl DlElement for u32 { + fn dl_dtype() -> cuvs_sys::DLDataType { + cuvs_sys::DLDataType { + code: cuvs_sys::DLDataTypeCode::kDLUInt as u8, + bits: 32, + lanes: 1, + } + } +} + +struct DeviceTensor { + shape: Vec, + tensor: cuvs_sys::DLManagedTensor, + capacity_bytes: usize, + resources: cuvs_sys::cuvsResources_t, + _marker: PhantomData, +} + +impl DeviceTensor { + fn try_new(resources: &Resources, shape: &[usize]) -> Result { + let capacity_bytes = shape.iter().product::() * std::mem::size_of::(); + let mut data = 
ptr::null_mut(); + check_cuvs( + unsafe { cuvs_sys::cuvsRMMAlloc(resources.0, &mut data, capacity_bytes) }, + "allocate device tensor", + )?; + let shape = shape.iter().map(|dim| *dim as i64).collect::>(); + let tensor = cuvs_sys::DLManagedTensor { + dl_tensor: cuvs_sys::DLTensor { + data, + device: cuvs_sys::DLDevice { + device_type: cuvs_sys::DLDeviceType::kDLCUDA, + device_id: 0, + }, + ndim: shape.len() as i32, + dtype: T::dl_dtype(), + shape: shape.as_ptr() as *mut i64, + strides: ptr::null_mut(), + byte_offset: 0, + }, + manager_ctx: ptr::null_mut(), + deleter: None, + }; + Ok(Self { + shape, + tensor, + capacity_bytes, + resources: resources.0, + _marker: PhantomData, + }) + } + + fn as_mut_ptr(&mut self) -> *mut cuvs_sys::DLManagedTensor { + debug_assert_eq!(self.shape.len(), self.tensor.dl_tensor.ndim as usize); + &mut self.tensor + } + + fn set_shape(&mut self, shape: &[usize]) -> Result<()> { + if shape.len() != self.shape.len() { + return Err(Error::io(format!( + "device tensor rank mismatch: expected {}, got {}", + self.shape.len(), + shape.len() + ))); + } + let required_bytes = shape.iter().product::() * std::mem::size_of::(); + if required_bytes > self.capacity_bytes { + return Err(Error::io(format!( + "device tensor capacity {} bytes is smaller than requested shape {:?} ({} bytes)", + self.capacity_bytes, shape, required_bytes + ))); + } + for (dst, src) in self.shape.iter_mut().zip(shape) { + *dst = *src as i64; + } + Ok(()) + } + + fn current_len(&self) -> usize { + self.shape.iter().map(|dim| *dim as usize).product() + } + + fn current_bytes(&self) -> usize { + self.current_len() * std::mem::size_of::() + } + + fn copy_from_host_async(&mut self, resources: &Resources, src: &[T]) -> Result<()> { + let expected_len = self.current_len(); + if src.len() != expected_len { + return Err(Error::io(format!( + "device tensor copy expects {expected_len} elements, got {}", + src.len() + ))); + } + check_cuda( + unsafe { + cuvs_sys::cudaMemcpyAsync( + 
self.tensor.dl_tensor.data, + src.as_ptr() as *const _, + self.current_bytes(), + cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, + resources + .get_cuda_stream() + .map_err(|e| Error::io(e.to_string()))?, + ) + }, + "copy host tensor to device", + ) + } + + fn copy_to_host_async(&self, resources: &Resources, dst: &mut [T]) -> Result<()> { + let expected_len = self.current_len(); + if dst.len() != expected_len { + return Err(Error::io(format!( + "device tensor copy expects destination length {expected_len}, got {}", + dst.len() + ))); + } + check_cuda( + unsafe { + cuvs_sys::cudaMemcpyAsync( + dst.as_mut_ptr() as *mut _, + self.tensor.dl_tensor.data, + self.current_bytes(), + cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, + resources + .get_cuda_stream() + .map_err(|e| Error::io(e.to_string()))?, + ) + }, + "copy device tensor to host", + ) + } +} + +impl Drop for DeviceTensor { + fn drop(&mut self) { + if !self.tensor.dl_tensor.data.is_null() { + let _ = unsafe { + cuvs_sys::cuvsRMMFree( + self.resources, + self.tensor.dl_tensor.data, + self.capacity_bytes, + ) + }; + } + } +} + +struct PinnedHostBuffer { + ptr: *mut T, + len: usize, + _marker: PhantomData, +} + +impl PinnedHostBuffer { + fn try_new(len: usize) -> Result { + let bytes = len + .checked_mul(std::mem::size_of::()) + .ok_or_else(|| Error::io("pinned host allocation size overflow"))?; + let mut raw = ptr::null_mut(); + check_cuda( + unsafe { cudaMallocHost(&mut raw, bytes) }, + "allocate pinned host buffer", + )?; + Ok(Self { + ptr: raw.cast::(), + len, + _marker: PhantomData, + }) + } + + fn as_slice(&self) -> &[T] { + unsafe { std::slice::from_raw_parts(self.ptr, self.len) } + } + + fn as_mut_slice(&mut self) -> &mut [T] { + unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) } + } + + fn prefix(&self, len: usize) -> Result<&[T]> { + if len > self.len { + return Err(Error::io(format!( + "pinned host buffer length {} is smaller than requested prefix {}", + self.len, len + ))); + } + 
Ok(&self.as_slice()[..len])
    }

    /// Mutable counterpart of `prefix`: borrow the first `len` elements.
    fn prefix_mut(&mut self, len: usize) -> Result<&mut [T]> {
        if len <= self.len {
            Ok(&mut self.as_mut_slice()[..len])
        } else {
            Err(Error::io(format!(
                "pinned host buffer length {} is smaller than requested prefix {}",
                self.len, len
            )))
        }
    }

    /// Copy `src` into the head of this pinned buffer.
    fn copy_from_slice(&mut self, src: &[T]) -> Result<()> {
        let needed = src.len();
        if needed > self.len {
            return Err(Error::io(format!(
                "pinned host buffer length {} is smaller than source length {}",
                self.len, needed
            )));
        }
        self.prefix_mut(needed)?.copy_from_slice(src);
        Ok(())
    }
}

impl<T> Drop for PinnedHostBuffer<T> {
    fn drop(&mut self) {
        // Any `cudaFreeHost` failure is intentionally swallowed: drop cannot
        // propagate an error.
        if self.ptr.is_null() {
            return;
        }
        let _ = unsafe { cudaFreeHost(self.ptr.cast::<std::ffi::c_void>()) };
    }
}

/// Thin RAII wrapper around a raw CUDA event handle.
struct CudaEvent {
    raw: CudaEventHandle,
}

impl CudaEvent {
    /// Allocate a fresh CUDA event.
    fn try_new() -> Result<Self> {
        let mut raw = ptr::null_mut();
        check_cuda(unsafe { cudaEventCreate(&mut raw) }, "create CUDA event")?;
        Ok(Self { raw })
    }

    /// Enqueue this event on `stream`.
    fn record(&self, stream: cuvs_sys::cudaStream_t) -> Result<()> {
        let status = unsafe { cudaEventRecord(self.raw, stream) };
        check_cuda(status, "record CUDA event")
    }

    /// Block the host until the event has been reached on its stream.
    fn synchronize(&self) -> Result<()> {
        let status = unsafe { cudaEventSynchronize(self.raw) };
        check_cuda(status, "synchronize CUDA event")
    }
}

impl Drop for CudaEvent {
    fn drop(&mut self) {
        if self.raw.is_null() {
            return;
        }
        let _ = unsafe { cudaEventDestroy(self.raw) };
    }
}

/// Turn a cuVS status into a `Result`, decorating failures with the last
/// cuVS error text when one is available.
fn check_cuvs(status: cuvs_sys::cuvsError_t, context: &str) -> Result<()> {
    if status == cuvs_sys::cuvsError_t::CUVS_SUCCESS {
        return Ok(());
    }

    // SAFETY: `cuvsGetLastErrorText` returns either null or a valid C string
    // owned by cuVS; we only read from it.
    let message = unsafe {
        let text = cuvs_sys::cuvsGetLastErrorText();
        if text.is_null() {
            format!("{status:?}")
        } else {
            format!(
                "{status:?}: {}",
                CStr::from_ptr(text).to_string_lossy().into_owned()
            )
        }
    };
    Err(Error::io(format!("cuVS failed to {context}: {message}")))
}

/// Turn a raw CUDA runtime status into a `Result`.
fn check_cuda(status: cuvs_sys::cudaError_t, context: &str) -> Result<()> {
    if status == cuvs_sys::cudaError::cudaSuccess {
        Ok(())
    } else {
Err(Error::io(format!("CUDA failed to {context}: {status:?}")))
    }
}

/// Map a Lance `DistanceType` onto the equivalent cuVS distance enum.
///
/// Returns `Error::not_supported` for metrics cuVS IVF_PQ cannot handle.
fn cuvs_distance_type(metric_type: DistanceType) -> Result<cuvs_sys::cuvsDistanceType> {
    match metric_type {
        DistanceType::L2 => Ok(cuvs_sys::cuvsDistanceType::L2Expanded),
        DistanceType::Cosine => Ok(cuvs_sys::cuvsDistanceType::CosineExpanded),
        DistanceType::Dot => Ok(cuvs_sys::cuvsDistanceType::InnerProduct),
        other => Err(Error::not_supported(format!(
            "cuVS IVF_PQ does not support metric {other:?}"
        ))),
    }
}

/// Allocate and populate a raw `cuvsIvfPqIndexParams_t`.
///
/// The caller owns the returned handle and must release it with
/// `destroy_index_params`.
fn create_index_params(
    metric_type: DistanceType,
    num_partitions: usize,
    num_sub_vectors: usize,
    sample_rate: usize,
    max_iters: usize,
    num_bits: usize,
) -> Result<cuvs_sys::cuvsIvfPqIndexParams_t> {
    // Resolve the metric *before* allocating the params handle; the previous
    // order leaked the allocation whenever the metric was unsupported.
    let metric = cuvs_distance_type(metric_type)?;
    let mut params = ptr::null_mut();
    check_cuvs(
        unsafe { cuvs_sys::cuvsIvfPqIndexParamsCreate(&mut params) },
        "allocate IVF_PQ index params",
    )?;
    // SAFETY: `params` was just allocated by cuVS and is non-null on success.
    unsafe {
        (*params).metric = metric;
        (*params).metric_arg = 0.0;
        // Vectors are added later via the transform path, not at build time.
        (*params).add_data_on_build = false;
        (*params).n_lists = num_partitions as u32;
        (*params).kmeans_n_iters = max_iters as u32;
        (*params).kmeans_trainset_fraction = 1.0;
        (*params).pq_bits = num_bits as u32;
        (*params).pq_dim = num_sub_vectors as u32;
        (*params).codebook_kind =
            cuvs_sys::cuvsIvfPqCodebookGen::CUVS_IVF_PQ_CODEBOOK_GEN_PER_SUBSPACE;
        (*params).force_random_rotation = false;
        (*params).conservative_memory_allocation = false;
        (*params).max_train_points_per_pq_code = sample_rate as u32;
        (*params).codes_layout = cuvs_sys::cuvsIvfPqListLayout::CUVS_IVF_PQ_LIST_LAYOUT_FLAT;
    }
    Ok(params)
}

/// Release a params handle created by `create_index_params`; null is a no-op.
fn destroy_index_params(params: cuvs_sys::cuvsIvfPqIndexParams_t) {
    if !params.is_null() {
        let _ = unsafe { cuvs_sys::cuvsIvfPqIndexParamsDestroy(params) };
    }
}

/// Build an empty host-side DLPack tensor view that cuVS getter calls fill in.
fn make_tensor_view() -> HostTensorView {
    let shape = Vec::new();
    let tensor = cuvs_sys::DLManagedTensor {
        dl_tensor: cuvs_sys::DLTensor {
            data: ptr::null_mut(),
            device: cuvs_sys::DLDevice {
                device_type: cuvs_sys::DLDeviceType::kDLCPU,
                device_id: 0,
            },
+ ndim: 0, + dtype: ::dl_dtype(), + shape: shape.as_ptr() as *mut i64, + strides: ptr::null_mut(), + byte_offset: 0, + }, + manager_ctx: ptr::null_mut(), + deleter: None, + }; + HostTensorView { shape, tensor } +} + +fn tensor_shape(tensor: &cuvs_sys::DLManagedTensor) -> Vec { + let dl_tensor = &tensor.dl_tensor; + (0..dl_tensor.ndim) + .map(|idx| unsafe { *dl_tensor.shape.add(idx as usize) as usize }) + .collect() +} + +fn tensor_num_bytes(tensor: &cuvs_sys::DLManagedTensor) -> usize { + let shape = tensor_shape(tensor); + let numel = shape.into_iter().product::(); + numel * ((tensor.dl_tensor.dtype.bits as usize) / 8) +} + +fn copy_tensor_to_host_f32_2d( + resources: &Resources, + tensor: &cuvs_sys::DLManagedTensor, +) -> Result> { + let shape = tensor_shape(tensor); + if shape.len() != 2 { + return Err(Error::io(format!( + "expected 2D tensor, got shape {shape:?}" + ))); + } + let mut array = Array2::::zeros((shape[0], shape[1])); + check_cuda( + unsafe { + cuvs_sys::cudaMemcpyAsync( + array.as_mut_ptr() as *mut _, + tensor.dl_tensor.data, + tensor_num_bytes(tensor), + cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, + resources + .get_cuda_stream() + .map_err(|e| Error::io(e.to_string()))?, + ) + }, + "copy tensor to host", + )?; + resources + .sync_stream() + .map_err(|e| Error::io(e.to_string()))?; + Ok(array) +} + +fn copy_tensor_to_host_f32_3d( + resources: &Resources, + tensor: &cuvs_sys::DLManagedTensor, +) -> Result<(Vec, [usize; 3])> { + let shape = tensor_shape(tensor); + if shape.len() != 3 { + return Err(Error::io(format!( + "expected 3D tensor, got shape {shape:?}" + ))); + } + let mut values = vec![0.0f32; shape.iter().product()]; + check_cuda( + unsafe { + cuvs_sys::cudaMemcpyAsync( + values.as_mut_ptr() as *mut _, + tensor.dl_tensor.data, + tensor_num_bytes(tensor), + cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, + resources + .get_cuda_stream() + .map_err(|e| Error::io(e.to_string()))?, + ) + }, + "copy tensor to host", + )?; + resources + 
.sync_stream() + .map_err(|e| Error::io(e.to_string()))?; + Ok((values, [shape[0], shape[1], shape[2]])) +} + +fn infer_dimension(dataset: &Dataset, column: &str) -> Result { + let field = dataset.schema().field(column).ok_or_else(|| { + Error::invalid_input(format!( + "column '{column}' does not exist in dataset schema" + )) + })?; + infer_vector_dim(&field.data_type()) +} + +fn matrix_from_vectors<'a>(vectors: &'a FixedSizeListArray) -> Result> { + let dim = vectors.value_length() as usize; + match vectors.value_type() { + DataType::Float32 => { + let values = vectors.values().as_primitive::(); + let values: &[f32] = values.values().as_ref(); + Ok(MatrixBuffer::Borrowed { + values, + rows: vectors.len(), + cols: dim, + }) + } + DataType::Float16 => { + let values = vectors.values().as_primitive::(); + let data = values + .values() + .iter() + .map(|value| value.to_f32()) + .collect::>(); + Ok(MatrixBuffer::Owned( + Array2::from_shape_vec((vectors.len(), dim), data).map_err(|error| { + Error::io(format!("failed to create float16 matrix copy: {error}")) + })?, + )) + } + DataType::Float64 => { + let values = vectors.values().as_primitive::(); + let data = values + .values() + .iter() + .map(|value| *value as f32) + .collect::>(); + Ok(MatrixBuffer::Owned( + Array2::from_shape_vec((vectors.len(), dim), data).map_err(|error| { + Error::io(format!("failed to create float64 matrix copy: {error}")) + })?, + )) + } + other => Err(Error::not_supported(format!( + "cuVS IVF_PQ currently supports float16/float32/float64 vectors, got {other}" + ))), + } +} + +fn ivf_centroids_from_host(array: Array2) -> Result { + let dim = array.ncols() as i32; + let values = Float32Array::from_iter_values(array.into_iter()); + Ok(FixedSizeListArray::try_new_from_values(values, dim)?) 
}

/// Re-layout the cuVS PQ codebook into Lance's on-disk ordering.
///
/// cuVS hands back `[subspace][component][centroid]` (validated against
/// `shape`); Lance stores `[subspace][centroid][component]`, so the two inner
/// axes are swapped while flattening.
fn pq_codebook_from_host(
    values: Vec<f32>,
    shape: [usize; 3],
    num_sub_vectors: usize,
    dimension: usize,
    num_bits: usize,
) -> Result<FixedSizeListArray> {
    let pq_book_size = 1usize << num_bits;
    let subvector_dim = dimension / num_sub_vectors;
    let expected = [num_sub_vectors, subvector_dim, pq_book_size];
    if shape != expected {
        return Err(Error::io(format!(
            "cuVS returned incompatible PQ codebook shape: expected {expected:?}, got {shape:?}"
        )));
    }

    let values = &values;
    let flattened: Vec<f32> = (0..num_sub_vectors)
        .flat_map(|subspace| {
            (0..pq_book_size).flat_map(move |centroid| {
                (0..subvector_dim).map(move |component| {
                    values[(subspace * subvector_dim + component) * pq_book_size + centroid]
                })
            })
        })
        .collect();

    Ok(FixedSizeListArray::try_new_from_values(
        Float32Array::from(flattened),
        subvector_dim as i32,
    )?)
}

/// Wrap the centroids and codebook in single-row list columns so they can be
/// stored together in the artifact metadata file.
fn build_metadata_batch(
    ivf_centroids: &FixedSizeListArray,
    pq_codebook: &FixedSizeListArray,
) -> Result<RecordBatch> {
    // Each column holds exactly one list row spanning the whole array.
    let wrap = |item_name: &str, array: &FixedSizeListArray| {
        let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, array.len() as i32]));
        ListArray::new(
            Arc::new(Field::new(item_name, array.data_type().clone(), false)),
            offsets,
            Arc::new(array.clone()),
            None,
        )
    };
    let ivf_list = wrap("_ivf_centroids_item", ivf_centroids);
    let pq_list = wrap("_pq_codebook_item", pq_codebook);
    let schema = Arc::new(ArrowSchema::new(vec![
        Field::new("_ivf_centroids", ivf_list.data_type().clone(), false),
        Field::new("_pq_codebook", pq_list.data_type().clone(), false),
    ]));
    Ok(RecordBatch::try_new(
        schema,
        vec![Arc::new(ivf_list), Arc::new(pq_list)],
    )?)
}

/// Writer options pinned to the partition-artifact file format version.
fn metadata_writer_options() -> Result<FileWriterOptions> {
    let format_version = PARTITION_ARTIFACT_FILE_VERSION.parse().map_err(|error| {
        Error::invalid_input(format!(
            "invalid partition artifact file version '{}': {}",
            PARTITION_ARTIFACT_FILE_VERSION, error
        ))
    })?;
    Ok(FileWriterOptions {
        format_version: Some(format_version),
        ..Default::default()
    })
}

/// Write the trained centroids/codebook plus the build parameters into the
/// artifact metadata file under `artifact_uri`.
async fn write_partition_artifact_metadata(
    artifact_uri: &str,
    trained: &TrainedIvfPqIndex,
) -> Result<()> {
    let (object_store, root_dir) = lance::io::ObjectStore::from_uri(artifact_uri)
        .await
        .map_err(|error| Error::io(error.to_string()))?;
    let path = root_dir.child(PARTITION_ARTIFACT_METADATA_FILE_NAME);
    let batch = build_metadata_batch(&trained.ivf_centroids, &trained.pq_codebook)?;
    let mut writer = FileWriter::try_new(
        object_store.create(&path).await?,
        lance_core::datatypes::Schema::try_from(batch.schema().as_ref())?,
        metadata_writer_options()?,
    )?;
    // Record the build parameters so a reader can validate compatibility
    // without decoding the payload.
    let metadata = [
        ("lance:index_build:artifact_version", "1".to_string()),
        (
            "lance:index_build:distance_type",
            trained.metric_type.to_string(),
        ),
        (
            "lance:index_build:num_partitions",
            trained.num_partitions.to_string(),
        ),
        (
            "lance:index_build:num_sub_vectors",
            trained.num_sub_vectors.to_string(),
        ),
        ("lance:index_build:num_bits", trained.num_bits.to_string()),
        ("lance:index_build:dimension", trained.dimension.to_string()),
    ];
    for (key, value) in metadata {
        writer.add_schema_metadata(key.to_string(), value);
    }
    writer.write_batch(&batch).await?;
    writer.finish().await?;
    Ok(())
}

/// Assemble one `(row id, partition id, PQ code)` batch for the artifact.
fn build_partition_batch(
    row_ids: Arc<dyn Array>,
    partitions: &[u32],
    pq_codes: &[u8],
    code_width: usize,
) -> Result<RecordBatch> {
    if pq_codes.len() != partitions.len() * code_width {
        return Err(Error::io(format!(
            "partition artifact batch expects {} PQ codes for {} rows and code width {}, got {}",
+ partitions.len() * code_width, + partitions.len(), + code_width, + pq_codes.len() + ))); + } + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new(ROW_ID, DataType::UInt64, false), + Field::new(PART_ID_COLUMN, DataType::UInt32, false), + Field::new( + PQ_CODE_COLUMN, + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::UInt8, true)), + code_width as i32, + ), + true, + ), + ])); + let pq_codes = FixedSizeListArray::try_new_from_values( + UInt8Array::from_iter_values(pq_codes.iter().copied()), + code_width as i32, + )?; + Ok(RecordBatch::try_new( + schema, + vec![ + row_ids, + Arc::new(UInt32Array::from_iter_values(partitions.iter().copied())), + Arc::new(pq_codes), + ], + )?) +} + +fn transform_batch_loss(batch: &RecordBatch) -> f64 { + batch + .metadata() + .get(LOSS_METADATA_KEY) + .and_then(|value| value.parse::().ok()) + .unwrap_or(0.0) +} + +struct TransformSlot { + input_host: PinnedHostBuffer, + input_device: DeviceTensor, + labels_host: PinnedHostBuffer, + labels_device: DeviceTensor, + codes_host: PinnedHostBuffer, + codes_device: DeviceTensor, + h2d_start: CudaEvent, + h2d_done: CudaEvent, + transform_done: CudaEvent, + output_ready: CudaEvent, + row_ids: Option>, + rows: usize, +} + +impl TransformSlot { + fn try_new( + resources: &Resources, + max_rows: usize, + dimension: usize, + code_width: usize, + ) -> Result { + Ok(Self { + input_host: PinnedHostBuffer::try_new(max_rows * dimension)?, + input_device: DeviceTensor::try_new(resources, &[max_rows, dimension])?, + labels_host: PinnedHostBuffer::try_new(max_rows)?, + labels_device: DeviceTensor::try_new(resources, &[max_rows])?, + codes_host: PinnedHostBuffer::try_new(max_rows * code_width)?, + codes_device: DeviceTensor::try_new(resources, &[max_rows, code_width])?, + h2d_start: CudaEvent::try_new()?, + h2d_done: CudaEvent::try_new()?, + transform_done: CudaEvent::try_new()?, + output_ready: CudaEvent::try_new()?, + row_ids: None, + rows: 0, + }) + } + + fn 
has_pending_output(&self) -> bool { + self.row_ids.is_some() + } + + fn launch( + &mut self, + trained: &TrainedIvfPqIndex, + stream: cuvs_sys::cudaStream_t, + row_ids: Arc, + matrix: &[f32], + rows: usize, + dimension: usize, + ) -> Result<()> { + let code_width = trained.pq_code_width(); + self.input_host.copy_from_slice(matrix)?; + self.input_device.set_shape(&[rows, dimension])?; + self.labels_device.set_shape(&[rows])?; + self.codes_device.set_shape(&[rows, code_width])?; + self.rows = rows; + self.row_ids = Some(row_ids); + + self.h2d_start.record(stream)?; + self.input_device.copy_from_host_async( + &trained.resources, + self.input_host.prefix(rows * dimension)?, + )?; + self.h2d_done.record(stream)?; + check_cuvs( + unsafe { + cuvs_sys::cuvsIvfPqTransform( + trained.resources.0, + trained.index.raw, + self.input_device.as_mut_ptr(), + self.labels_device.as_mut_ptr(), + self.codes_device.as_mut_ptr(), + ) + }, + "transform vectors with IVF_PQ", + )?; + self.transform_done.record(stream)?; + self.labels_device + .copy_to_host_async(&trained.resources, self.labels_host.prefix_mut(rows)?)?; + self.codes_device.copy_to_host_async( + &trained.resources, + self.codes_host.prefix_mut(rows * code_width)?, + )?; + self.output_ready.record(stream)?; + Ok(()) + } + + fn drain_to_batch(&mut self, code_width: usize) -> Result> { + if !self.has_pending_output() { + return Ok(None); + } + + self.output_ready.synchronize()?; + let row_ids = self + .row_ids + .take() + .ok_or_else(|| Error::io("transform slot is missing row ids"))?; + let batch = build_partition_batch( + row_ids, + self.labels_host.prefix(self.rows)?, + self.codes_host.prefix(self.rows * code_width)?, + code_width, + )?; + self.rows = 0; + Ok(Some(batch)) + } +} + +async fn for_each_transformed_batch( + dataset: &Dataset, + column: &str, + trained: &TrainedIvfPqIndex, + batch_size: usize, + filter_nan: bool, + mut on_batch: F, +) -> Result<()> +where + F: FnMut(RecordBatch) -> Fut, + Fut: 
std::future::Future>, +{ + let code_width = trained.pq_code_width(); + let mut scanner = dataset.scan(); + scanner.project(&[column])?; + if dataset + .schema() + .field(column) + .is_some_and(|field| field.nullable && filter_nan) + { + scanner.filter(&format!("{column} is not null"))?; + } + scanner.with_row_id(); + scanner.batch_size(batch_size); + let mut stream = scanner.try_into_stream().await?; + let cuda_stream = trained + .resources + .get_cuda_stream() + .map_err(|error| Error::io(error.to_string()))?; + let mut slots = (0..PIPELINE_SLOTS) + .map(|_| { + TransformSlot::try_new( + &trained.resources, + batch_size, + trained.dimension, + code_width, + ) + }) + .collect::>>()?; + let mut next_slot = 0usize; + + loop { + let Some(batch) = stream.try_next().await? else { + break; + }; + let slot = &mut slots[next_slot]; + if let Some(transformed) = slot.drain_to_batch(code_width)? { + on_batch(transformed).await?; + } + + let vectors = vector_column_to_fsl(&batch, column)?; + let row_ids = batch + .column_by_name(ROW_ID) + .ok_or_else(|| Error::invalid_input(format!("transform batch is missing {ROW_ID}")))?; + let finite_mask = is_finite(&vectors); + let valid_rows = finite_mask.true_count(); + if valid_rows == 0 { + continue; + } + if valid_rows != vectors.len() { + warn!( + "{} vectors are ignored during partition assignment because they are null or non-finite", + vectors.len() - valid_rows + ); + } + + let filtered_row_ids = if valid_rows == row_ids.len() { + row_ids.clone() + } else { + filter(row_ids.as_ref(), &finite_mask)? + }; + let filtered_vectors = if valid_rows == vectors.len() { + vectors + } else { + let vector_column = batch.column_by_name(column).ok_or_else(|| { + Error::invalid_input(format!( + "transform batch is missing vector column '{column}'" + )) + })?; + let field = batch + .schema() + .field_with_name(column) + .map_err(|_| { + Error::invalid_input(format!( + "transform batch schema is missing field '{column}'" + )) + })? 
+ .clone(); + let filtered_vectors = filter(vector_column.as_ref(), &finite_mask)?; + vector_column_to_fsl( + &RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![field])), + vec![filtered_vectors], + )?, + column, + )? + }; + + let matrix = matrix_from_vectors(&filtered_vectors)?; + let matrix_view = matrix.view()?; + let input_slice = matrix_view + .as_slice_memory_order() + .ok_or_else(|| Error::io("transform matrix is not contiguous"))?; + + slot.launch( + trained, + cuda_stream, + filtered_row_ids, + input_slice, + matrix.rows(), + matrix_view.ncols(), + )?; + next_slot = (next_slot + 1) % PIPELINE_SLOTS; + } + + for slot in &mut slots { + if let Some(transformed) = slot.drain_to_batch(code_width)? { + on_batch(transformed).await?; + } + } + Ok(()) +} + +pub async fn train_ivf_pq( + dataset: &Dataset, + column: &str, + num_partitions: usize, + metric_type: DistanceType, + num_sub_vectors: usize, + sample_rate: usize, + max_iters: usize, + num_bits: usize, + filter_nan: bool, +) -> Result { + if num_bits != 8 { + return Err(Error::not_supported( + "cuVS IVF_PQ currently supports only num_bits=8", + )); + } + + let dimension = infer_dimension(dataset, column)?; + if dimension % num_sub_vectors != 0 { + return Err(Error::invalid_input(format!( + "cuVS IVF_PQ requires vector dimension {} to be divisible by num_sub_vectors {}", + dimension, num_sub_vectors + ))); + } + + let num_rows = dataset.count_rows(None).await?; + if num_rows == 0 { + return Err(Error::invalid_input( + "cuVS training requires at least one training vector", + )); + } + let train_rows = num_rows + .min((num_partitions * sample_rate).max(256 * 256)) + .max(1); + let train_vectors = if filter_nan { + let batch = dataset.scan().project(&[column])?.try_into_batch().await?; + let vectors = vector_column_to_fsl(&batch, column)?; + let mask = is_finite(&vectors); + let filtered = filter(&vectors, &mask)?.as_fixed_size_list().clone(); + filtered.slice(0, train_rows.min(filtered.len())) + } else { + 
let projection = dataset.schema().project(&[column])?; + let batch = dataset.sample(train_rows, &projection, None).await?; + vector_column_to_fsl(&batch, column)? + }; + if train_vectors.is_empty() { + return Err(Error::invalid_input( + "cuVS training requires at least one non-null training vector", + )); + } + + let matrix = matrix_from_vectors(&train_vectors)?; + let resources = Resources::new().map_err(|error| Error::io(error.to_string()))?; + let index = CuvsIvfPqIndex::try_new()?; + let params = create_index_params( + metric_type, + num_partitions, + num_sub_vectors, + sample_rate, + max_iters, + num_bits, + )?; + let matrix_view = matrix.view()?; + let mut dataset_tensor = HostTensorView::try_new::( + &[matrix_view.nrows(), matrix_view.ncols()], + matrix_view.as_ptr() as *mut std::ffi::c_void, + ); + + let build_result = check_cuvs( + unsafe { + cuvs_sys::cuvsIvfPqBuild(resources.0, params, dataset_tensor.as_mut_ptr(), index.raw) + }, + "build IVF_PQ index", + ); + destroy_index_params(params); + build_result?; + + let mut centers = make_tensor_view(); + check_cuvs( + unsafe { cuvs_sys::cuvsIvfPqIndexGetCenters(index.raw, centers.as_mut_ptr()) }, + "get IVF centroids", + )?; + let ivf_centroids = + ivf_centroids_from_host(copy_tensor_to_host_f32_2d(&resources, ¢ers.tensor)?)?; + + let mut pq_centers = make_tensor_view(); + check_cuvs( + unsafe { cuvs_sys::cuvsIvfPqIndexGetPqCenters(index.raw, pq_centers.as_mut_ptr()) }, + "get PQ codebook", + )?; + let (pq_codebook_values, pq_codebook_shape) = + copy_tensor_to_host_f32_3d(&resources, &pq_centers.tensor)?; + let pq_codebook = pq_codebook_from_host( + pq_codebook_values, + pq_codebook_shape, + num_sub_vectors, + dimension, + num_bits, + )?; + + Ok(TrainedIvfPqIndex { + resources, + index, + num_partitions, + dimension, + num_sub_vectors, + num_bits, + metric_type, + ivf_centroids, + pq_codebook, + }) +} + +pub async fn assign_ivf_pq_to_artifact( + dataset: &Dataset, + column: &str, + trained: 
&TrainedIvfPqIndex, + artifact_uri: &str, + batch_size: usize, + filter_nan: bool, +) -> Result> { + let code_width = trained.pq_code_width(); + let builder = Arc::new(tokio::sync::Mutex::new( + PartitionArtifactBuilder::try_new(artifact_uri, trained.num_partitions, code_width, None) + .await?, + )); + for_each_transformed_batch(dataset, column, trained, batch_size, filter_nan, |batch| { + let builder = builder.clone(); + async move { + builder.lock().await.append_batch(&batch).await?; + Ok(()) + } + }) + .await?; + let mut builder = Arc::try_unwrap(builder) + .map_err(|_| Error::io("partition artifact builder still has outstanding references"))? + .into_inner(); + + write_partition_artifact_metadata(artifact_uri, trained).await?; + let mut files = builder.finish(PARTITION_ARTIFACT_METADATA_FILE_NAME, None).await?; + if files.len() > 1 { + files.insert(1, PARTITION_ARTIFACT_METADATA_FILE_NAME.to_string()); + } + Ok(files) +} diff --git a/rust/lance-index/src/vector/ivf/builder.rs b/rust/lance-index/src/vector/ivf/builder.rs index 9154adbcd80..155b33f58b9 100644 --- a/rust/lance-index/src/vector/ivf/builder.rs +++ b/rust/lance-index/src/vector/ivf/builder.rs @@ -53,6 +53,10 @@ pub struct IvfBuildParams { /// Requires `centroids` to be set. pub precomputed_encoded_dataset_uri: Option, + /// Precomputed partitioned artifact produced by an external backend. + /// Mutually exclusive with other precomputed inputs and requires `centroids` to be set. 
+ pub precomputed_partition_artifact_uri: Option, + pub shuffle_partition_batches: usize, pub shuffle_partition_concurrency: usize, @@ -73,6 +77,7 @@ impl Default for IvfBuildParams { precomputed_partitions_file: None, precomputed_shuffle_buffers: None, precomputed_encoded_dataset_uri: None, + precomputed_partition_artifact_uri: None, shuffle_partition_batches: 1024 * 10, shuffle_partition_concurrency: 2, storage_options: None, diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index 13176c3bca8..48235a65582 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -10,6 +10,7 @@ use std::{any::Any, collections::HashMap}; pub mod builder; mod encoded_dataset; pub mod ivf; +mod partition_artifact; pub mod pq; pub mod utils; @@ -32,6 +33,7 @@ use lance_index::vector::hnsw::HNSW; use lance_index::vector::ivf::builder::recommended_num_partitions; use lance_index::vector::ivf::storage::IvfModel; use object_store::path::Path; +pub use partition_artifact::PartitionArtifactBuilder; use lance_arrow::FixedSizeListArrayExt; use lance_index::vector::pq::ProductQuantizer; @@ -1657,6 +1659,7 @@ fn derive_ivf_params(ivf_model: &IvfModel) -> IvfBuildParams { precomputed_partitions_file: None, precomputed_shuffle_buffers: None, precomputed_encoded_dataset_uri: None, + precomputed_partition_artifact_uri: None, shuffle_partition_batches: 1024 * 10, // Default shuffle_partition_concurrency: 2, // Default storage_options: None, diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 258e978f1ac..9c9d4b16eed 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -94,6 +94,7 @@ use super::v2::IVFIndex; use super::{ encoded_dataset::EncodedDatasetShuffleReader, ivf::load_precomputed_partitions_if_available, + partition_artifact::PartitionArtifactShuffleReader, utils::{self, get_vector_type}, }; @@ -240,6 +241,19 @@ impl IvfIndexBuilder )) } + async fn 
try_open_precomputed_partition_artifact_reader( + &self, + uri: &str, + ) -> Result> { + let storage_options = self + .ivf_params + .as_ref() + .and_then(|params| params.storage_options.as_ref()); + Ok(Arc::new( + PartitionArtifactShuffleReader::try_open(uri, storage_options).await?, + )) + } + #[allow(clippy::too_many_arguments)] pub fn new( dataset: Dataset, @@ -622,6 +636,19 @@ impl IvfIndexBuilder return Err(Error::invalid_input("dataset not set before shuffling")); }; + if let Some(uri) = self + .ivf_params + .as_ref() + .and_then(|params| params.precomputed_partition_artifact_uri.as_deref()) + { + log::info!("shuffle with precomputed partition artifact from {}", uri); + self.shuffle_reader = Some( + self.try_open_precomputed_partition_artifact_reader(uri) + .await?, + ); + return Ok(()); + } + if let Some(uri) = self .ivf_params .as_ref() diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 1f2b47887db..4841c98d661 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -1210,6 +1210,12 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } + if ivf.precomputed_partition_artifact_uri.is_some() && ivf.centroids.is_none() { + return Err(Error::index( + "precomputed_partition_artifact_uri requires centroids to be set".to_string(), + )); + } + if ivf.precomputed_shuffle_buffers.is_some() && ivf.precomputed_partitions_file.is_some() { return Err(Error::index( "precomputed_shuffle_buffers and precomputed_partitions_file are mutually exclusive" @@ -1231,6 +1237,31 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } + if ivf.precomputed_partition_artifact_uri.is_some() && ivf.precomputed_partitions_file.is_some() + { + return Err(Error::index( + "precomputed_partition_artifact_uri and precomputed_partitions_file are mutually exclusive" + .to_string(), + )); + } + + if ivf.precomputed_partition_artifact_uri.is_some() && 
ivf.precomputed_shuffle_buffers.is_some() + { + return Err(Error::index( + "precomputed_partition_artifact_uri and precomputed_shuffle_buffers are mutually exclusive" + .to_string(), + )); + } + + if ivf.precomputed_partition_artifact_uri.is_some() + && ivf.precomputed_encoded_dataset_uri.is_some() + { + return Err(Error::index( + "precomputed_partition_artifact_uri and precomputed_encoded_dataset_uri are mutually exclusive" + .to_string(), + )); + } + Ok(()) } @@ -1248,6 +1279,12 @@ fn sanity_check_params(ivf: &IvfBuildParams, pq: &PQBuildParams) -> Result<()> { )); } + if ivf.precomputed_partition_artifact_uri.is_some() && pq.codebook.is_none() { + return Err(Error::index( + "precomputed_partition_artifact_uri requires codebooks to be set".to_string(), + )); + } + Ok(()) } diff --git a/rust/lance/src/index/vector/partition_artifact.rs b/rust/lance/src/index/vector/partition_artifact.rs new file mode 100644 index 00000000000..cfd2a0f2b4a --- /dev/null +++ b/rust/lance/src/index/vector/partition_artifact.rs @@ -0,0 +1,956 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::collections::HashMap; +use std::mem; +use std::ops::Range; +use std::sync::{Arc, Mutex}; + +use arrow_array::cast::AsArray; +use arrow_array::{FixedSizeListArray, RecordBatch, UInt8Array, UInt32Array, UInt64Array}; +use arrow_schema::{DataType, Field, Schema as ArrowSchema}; +use futures::TryStreamExt; +use lance_arrow::FixedSizeListArrayExt; +use lance_core::cache::LanceCache; +use lance_core::datatypes::Schema; +use lance_core::{Error, ROW_ID, Result}; +use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; +use lance_file::reader::{FileReader, FileReaderOptions}; +use lance_file::version::LanceFileVersion; +use lance_file::writer::{FileWriter, FileWriterOptions}; +use lance_index::vector::v3::shuffler::ShuffleReader; +use lance_index::vector::{PART_ID_COLUMN, PQ_CODE_COLUMN}; +use lance_io::ReadBatchParams; +use 
lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; +use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; +use lance_io::stream::{RecordBatchStream, RecordBatchStreamAdapter}; +use lance_io::traits::Writer; +use lance_io::utils::CachedFileSize; +use object_store::path::Path; +use serde::{Deserialize, Serialize}; +use tokio::io::AsyncWriteExt; + +const PARTITION_ARTIFACT_MANIFEST_VERSION: u32 = 1; +const PARTITION_ARTIFACT_MANIFEST_FILE_NAME: &str = "manifest.json"; +const PARTITION_ARTIFACT_PARTITIONS_DIR: &str = "partitions"; +const PARTITION_ARTIFACT_DEFAULT_BUCKETS: usize = 256; +const PARTITION_ARTIFACT_STAGING_PREFIX: &str = ".staging-bucket-"; +const PARTITION_ARTIFACT_BUCKET_PREFIX: &str = "bucket-"; +const PARTITION_ARTIFACT_FILE_VERSION: &str = "2.2"; +const PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS: usize = 32 * 1024; + +#[derive(Debug, Serialize, Deserialize)] +struct PartitionArtifactManifest { + version: u32, + num_partitions: usize, + #[serde(default)] + metadata_file: Option, + #[serde(default)] + total_loss: Option, + partitions: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct PartitionArtifactPartition { + #[serde(default)] + path: Option, + #[serde(default)] + num_rows: usize, + #[serde(default)] + ranges: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct PartitionArtifactRange { + offset: u64, + num_rows: u64, +} + +#[derive(Default, Debug)] +struct BucketBuffer { + row_ids: Vec, + partition_ids: Vec, + pq_values: Vec, +} + +impl BucketBuffer { + fn len(&self) -> usize { + self.row_ids.len() + } + + fn is_empty(&self) -> bool { + self.row_ids.is_empty() + } +} + +pub struct PartitionArtifactBuilder { + object_store: Arc, + root_dir: Path, + num_partitions: usize, + num_buckets: usize, + pq_code_width: usize, + temp_schema: Arc, + final_schema: Arc, + temp_writers: Vec>, + buffers: Vec, +} + +impl PartitionArtifactBuilder { + pub async fn try_new( + uri: &str, + num_partitions: 
usize, + pq_code_width: usize, + storage_options: Option<&HashMap>, + ) -> Result { + let registry = Arc::new(ObjectStoreRegistry::default()); + let params = if let Some(storage_options) = storage_options { + ObjectStoreParams { + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + storage_options.clone(), + ), + )), + ..Default::default() + } + } else { + ObjectStoreParams::default() + }; + let (object_store, root_dir) = + ObjectStore::from_uri_and_params(registry, uri, ¶ms).await?; + Self::try_new_with_store(object_store, root_dir, num_partitions, pq_code_width) + } + + pub fn try_new_with_store( + object_store: Arc, + root_dir: Path, + num_partitions: usize, + pq_code_width: usize, + ) -> Result { + if num_partitions == 0 { + return Err(Error::invalid_input( + "partition artifact builder requires num_partitions > 0".to_string(), + )); + } + if pq_code_width == 0 { + return Err(Error::invalid_input( + "partition artifact builder requires pq_code_width > 0".to_string(), + )); + } + + let num_buckets = num_partitions + .min(PARTITION_ARTIFACT_DEFAULT_BUCKETS) + .max(1); + let temp_schema = Arc::new(ArrowSchema::new(vec![ + Field::new(ROW_ID, DataType::UInt64, false), + Field::new(PART_ID_COLUMN, DataType::UInt32, false), + Field::new( + PQ_CODE_COLUMN, + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::UInt8, true)), + pq_code_width as i32, + ), + true, + ), + ])); + let final_schema = Arc::new(ArrowSchema::new(vec![ + Field::new(ROW_ID, DataType::UInt64, false), + Field::new( + PQ_CODE_COLUMN, + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::UInt8, true)), + pq_code_width as i32, + ), + true, + ), + ])); + + Ok(Self { + object_store, + root_dir, + num_partitions, + num_buckets, + pq_code_width, + temp_schema, + final_schema, + temp_writers: (0..num_buckets).map(|_| None).collect(), + buffers: (0..num_buckets).map(|_| BucketBuffer::default()).collect(), + }) + } + + pub 
async fn append_batch(&mut self, batch: &RecordBatch) -> Result<()> { + validate_input_batch(batch, self.pq_code_width)?; + + let row_ids = batch[ROW_ID].as_primitive::(); + let part_ids = batch[PART_ID_COLUMN].as_primitive::(); + let pq_codes = batch[PQ_CODE_COLUMN].as_fixed_size_list(); + let pq_values = pq_codes + .values() + .as_primitive::(); + let pq_values = pq_values.values().as_ref(); + + for row_idx in 0..batch.num_rows() { + let partition_id = part_ids.value(row_idx) as usize; + if partition_id >= self.num_partitions { + return Err(Error::invalid_input(format!( + "partition artifact batch contains partition id {} but num_partitions is {}", + partition_id, self.num_partitions + ))); + } + let bucket_id = partition_id % self.num_buckets; + let buffer = &mut self.buffers[bucket_id]; + buffer.row_ids.push(row_ids.value(row_idx)); + buffer.partition_ids.push(partition_id as u32); + let start = row_idx * self.pq_code_width; + let end = start + self.pq_code_width; + buffer.pq_values.extend_from_slice(&pq_values[start..end]); + if buffer.len() >= PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS { + self.flush_bucket(bucket_id).await?; + } + } + Ok(()) + } + + pub async fn finish( + &mut self, + metadata_file: &str, + total_loss: Option, + ) -> Result> { + for bucket_id in 0..self.num_buckets { + self.flush_bucket(bucket_id).await?; + } + for writer in self.temp_writers.iter_mut() { + if let Some(writer) = writer.as_mut() { + writer.finish().await?; + } + } + + let mut partitions = vec![ + PartitionArtifactPartition { + path: None, + num_rows: 0, + ranges: Vec::new(), + }; + self.num_partitions + ]; + let mut artifact_files = Vec::with_capacity(self.num_buckets + 1); + + for bucket_id in 0..self.num_buckets { + if let Some(relative_path) = self.finalize_bucket(bucket_id, &mut partitions).await? 
{ + artifact_files.push(relative_path); + } + } + + let manifest = PartitionArtifactManifest { + version: PARTITION_ARTIFACT_MANIFEST_VERSION, + num_partitions: self.num_partitions, + metadata_file: Some(metadata_file.to_string()), + total_loss, + partitions, + }; + write_json( + self.object_store.as_ref(), + &self.root_dir.child(PARTITION_ARTIFACT_MANIFEST_FILE_NAME), + &manifest, + ) + .await?; + + let mut files = vec![PARTITION_ARTIFACT_MANIFEST_FILE_NAME.to_string()]; + files.extend(artifact_files); + Ok(files) + } + + async fn flush_bucket(&mut self, bucket_id: usize) -> Result<()> { + if self.buffers[bucket_id].is_empty() { + return Ok(()); + } + + let batch = self.take_temp_batch(bucket_id)?; + let writer = self.ensure_temp_writer(bucket_id).await?; + writer.write_batch(&batch).await?; + Ok(()) + } + + fn take_temp_batch(&mut self, bucket_id: usize) -> Result { + let buffer = &mut self.buffers[bucket_id]; + let row_ids = UInt64Array::from(mem::take(&mut buffer.row_ids)); + let part_ids = UInt32Array::from(mem::take(&mut buffer.partition_ids)); + let pq_values = UInt8Array::from(mem::take(&mut buffer.pq_values)); + let pq_codes = + FixedSizeListArray::try_new_from_values(pq_values, self.pq_code_width as i32)?; + RecordBatch::try_new( + self.temp_schema.clone(), + vec![Arc::new(row_ids), Arc::new(part_ids), Arc::new(pq_codes)], + ) + .map_err(Error::from) + } + + async fn ensure_temp_writer(&mut self, bucket_id: usize) -> Result<&mut FileWriter> { + if self.temp_writers[bucket_id].is_none() { + let path = self.temp_bucket_path(bucket_id); + let writer = FileWriter::try_new( + self.object_store.create(&path).await?, + Schema::try_from(self.temp_schema.as_ref())?, + file_writer_options()?, + )?; + self.temp_writers[bucket_id] = Some(writer); + } + Ok(self.temp_writers[bucket_id] + .as_mut() + .expect("temp writer initialized")) + } + + async fn finalize_bucket( + &self, + bucket_id: usize, + partitions: &mut [PartitionArtifactPartition], + ) -> Result> { + let 
temp_path = self.temp_bucket_path(bucket_id); + if !self.object_store.exists(&temp_path).await? { + return Ok(None); + } + + let reader = FileReader::try_open( + ScanScheduler::new( + self.object_store.clone(), + SchedulerConfig::max_bandwidth(&self.object_store), + ) + .open_file(&temp_path, &CachedFileSize::unknown()) + .await?, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await?; + + let batches = reader + .read_stream( + ReadBatchParams::RangeFull, + u32::MAX, + 16, + FilterExpression::no_filter(), + )? + .try_collect::>() + .await?; + let total_rows = batches.iter().map(|batch| batch.num_rows()).sum::(); + if total_rows == 0 { + self.object_store.delete(&temp_path).await?; + return Ok(None); + } + + let mut row_ids = Vec::with_capacity(total_rows); + let mut partition_ids = Vec::with_capacity(total_rows); + let mut pq_values = Vec::with_capacity(total_rows * self.pq_code_width); + for batch in batches { + let batch_row_ids = batch[ROW_ID].as_primitive::(); + let batch_partition_ids = + batch[PART_ID_COLUMN].as_primitive::(); + let batch_pq = batch[PQ_CODE_COLUMN].as_fixed_size_list(); + let batch_pq_values = batch_pq + .values() + .as_primitive::(); + row_ids.extend(batch_row_ids.values().iter().copied()); + partition_ids.extend(batch_partition_ids.values().iter().copied()); + pq_values.extend_from_slice(batch_pq_values.values().as_ref()); + } + + let mut permutation = (0..total_rows).collect::>(); + permutation.sort_unstable_by_key(|&idx| partition_ids[idx]); + + let mut sorted_row_ids = Vec::with_capacity(total_rows); + let mut sorted_partition_ids = Vec::with_capacity(total_rows); + let mut sorted_pq_values = Vec::with_capacity(total_rows * self.pq_code_width); + for idx in permutation { + sorted_row_ids.push(row_ids[idx]); + sorted_partition_ids.push(partition_ids[idx]); + let start = idx * self.pq_code_width; + let end = start + self.pq_code_width; + 
sorted_pq_values.extend_from_slice(&pq_values[start..end]); + } + + let final_path = self.final_bucket_path(bucket_id); + let final_relative_path = self.final_bucket_relative_path(bucket_id); + let mut writer = FileWriter::try_new( + self.object_store.create(&final_path).await?, + Schema::try_from(self.final_schema.as_ref())?, + file_writer_options()?, + )?; + let final_batch = RecordBatch::try_new( + self.final_schema.clone(), + vec![ + Arc::new(UInt64Array::from(sorted_row_ids)), + Arc::new(FixedSizeListArray::try_new_from_values( + UInt8Array::from(sorted_pq_values), + self.pq_code_width as i32, + )?), + ], + )?; + writer.write_batch(&final_batch).await?; + writer.finish().await?; + + let mut offset = 0usize; + while offset < sorted_partition_ids.len() { + let partition_id = sorted_partition_ids[offset] as usize; + let mut end = offset + 1; + while end < sorted_partition_ids.len() + && sorted_partition_ids[end] == sorted_partition_ids[offset] + { + end += 1; + } + partitions[partition_id] = PartitionArtifactPartition { + path: Some(final_relative_path.clone()), + num_rows: end - offset, + ranges: vec![PartitionArtifactRange { + offset: offset as u64, + num_rows: (end - offset) as u64, + }], + }; + offset = end; + } + + self.object_store.delete(&temp_path).await?; + Ok(Some(final_relative_path)) + } + + fn temp_bucket_path(&self, bucket_id: usize) -> Path { + self.root_dir + .child(PARTITION_ARTIFACT_PARTITIONS_DIR) + .child(format!( + "{PARTITION_ARTIFACT_STAGING_PREFIX}{bucket_id:05}.lance" + )) + } + + fn final_bucket_path(&self, bucket_id: usize) -> Path { + self.root_dir + .child(PARTITION_ARTIFACT_PARTITIONS_DIR) + .child(format!( + "{PARTITION_ARTIFACT_BUCKET_PREFIX}{bucket_id:05}.lance" + )) + } + + fn final_bucket_relative_path(&self, bucket_id: usize) -> String { + format!( + "{PARTITION_ARTIFACT_PARTITIONS_DIR}/{PARTITION_ARTIFACT_BUCKET_PREFIX}{bucket_id:05}.lance" + ) + } +} + +#[derive(Debug)] +pub(crate) struct PartitionArtifactShuffleReader { + 
scheduler: Arc, + root_dir: Path, + partitions: Vec, + total_loss: Option, + file_readers: Mutex>>, +} + +fn file_writer_options() -> Result { + Ok(FileWriterOptions { + format_version: Some( + PARTITION_ARTIFACT_FILE_VERSION + .parse::() + .map_err(|error| { + Error::invalid_input(format!( + "invalid partition artifact file version '{}': {}", + PARTITION_ARTIFACT_FILE_VERSION, error + )) + })?, + ), + ..Default::default() + }) +} + +fn validate_input_batch(batch: &RecordBatch, pq_code_width: usize) -> Result<()> { + let Some(row_ids) = batch.column_by_name(ROW_ID) else { + return Err(Error::invalid_input(format!( + "partition artifact batch must contain {ROW_ID}" + ))); + }; + if row_ids.data_type() != &DataType::UInt64 { + return Err(Error::invalid_input(format!( + "partition artifact batch column {ROW_ID} must be uint64, got {}", + row_ids.data_type() + ))); + } + let Some(part_ids) = batch.column_by_name(PART_ID_COLUMN) else { + return Err(Error::invalid_input(format!( + "partition artifact batch must contain {PART_ID_COLUMN}" + ))); + }; + if part_ids.data_type() != &DataType::UInt32 { + return Err(Error::invalid_input(format!( + "partition artifact batch column {PART_ID_COLUMN} must be uint32, got {}", + part_ids.data_type() + ))); + } + let Some(pq_codes) = batch.column_by_name(PQ_CODE_COLUMN) else { + return Err(Error::invalid_input(format!( + "partition artifact batch must contain {PQ_CODE_COLUMN}" + ))); + }; + match pq_codes.data_type() { + DataType::FixedSizeList(_, width) if *width as usize == pq_code_width => Ok(()), + other => Err(Error::invalid_input(format!( + "partition artifact batch column {PQ_CODE_COLUMN} must be fixed_size_list[{}], got {}", + pq_code_width, other + ))), + } +} + +async fn write_json( + object_store: &ObjectStore, + path: &Path, + value: &T, +) -> Result<()> { + let bytes = serde_json::to_vec(value).map_err(|error| { + Error::invalid_input(format!( + "failed to serialize partition artifact manifest '{}': {}", + path, error + 
)) + })?; + let mut writer = object_store.create(path).await?; + writer.write_all(&bytes).await?; + Writer::shutdown(writer.as_mut()).await?; + Ok(()) +} + +impl PartitionArtifactShuffleReader { + pub(crate) async fn try_open( + uri: &str, + storage_options: Option<&HashMap>, + ) -> Result { + let registry = Arc::new(ObjectStoreRegistry::default()); + let params = if let Some(storage_options) = storage_options { + ObjectStoreParams { + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + storage_options.clone(), + ), + )), + ..Default::default() + } + } else { + ObjectStoreParams::default() + }; + let (object_store, root_dir) = + ObjectStore::from_uri_and_params(registry, uri, ¶ms).await?; + Self::try_open_with_store(object_store, root_dir).await + } + + async fn try_open_with_store(object_store: Arc, root_dir: Path) -> Result { + let manifest_path = root_dir.child("manifest.json"); + let manifest_bytes = object_store.read_one_all(&manifest_path).await?; + let manifest: PartitionArtifactManifest = + serde_json::from_slice(&manifest_bytes).map_err(|error| { + Error::invalid_input(format!( + "failed to parse partition artifact manifest '{}': {}", + manifest_path, error + )) + })?; + if manifest.version != 1 { + return Err(Error::invalid_input(format!( + "unsupported partition artifact manifest version {}", + manifest.version + ))); + } + if manifest.partitions.len() != manifest.num_partitions { + return Err(Error::invalid_input(format!( + "partition artifact manifest has {} partitions but num_partitions is {}", + manifest.partitions.len(), + manifest.num_partitions + ))); + } + + let scheduler = ScanScheduler::new( + object_store.clone(), + SchedulerConfig::max_bandwidth(&object_store), + ); + Ok(Self { + scheduler, + root_dir, + partitions: manifest.partitions, + total_loss: manifest.total_loss, + file_readers: Mutex::new(HashMap::new()), + }) + } + + async fn open_file_reader(&self, relative_path: &str) 
-> Result> { + if let Some(reader) = self + .file_readers + .lock() + .expect("partition artifact reader mutex poisoned") + .get(relative_path) + .cloned() + { + return Ok(reader); + } + + let path = join_relative_path(&self.root_dir, relative_path); + let reader = Arc::new( + FileReader::try_open( + self.scheduler + .open_file(&path, &CachedFileSize::unknown()) + .await?, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await?, + ); + self.file_readers + .lock() + .expect("partition artifact reader mutex poisoned") + .insert(relative_path.to_string(), reader.clone()); + Ok(reader) + } +} + +fn join_relative_path(root_dir: &Path, relative_path: &str) -> Path { + relative_path + .split('/') + .filter(|segment| !segment.is_empty()) + .fold(root_dir.clone(), |path, segment| path.child(segment)) +} + +#[async_trait::async_trait] +impl ShuffleReader for PartitionArtifactShuffleReader { + async fn read_partition( + &self, + partition_id: usize, + ) -> Result>> { + let Some(partition) = self.partitions.get(partition_id) else { + return Ok(None); + }; + if partition.num_rows == 0 { + return Ok(None); + } + let path = partition.path.as_ref().ok_or_else(|| { + Error::invalid_input(format!( + "partition artifact partition {} has {} rows but no path", + partition_id, partition.num_rows + )) + })?; + if partition.ranges.is_empty() { + return Err(Error::invalid_input(format!( + "partition artifact partition {} has {} rows but no ranges", + partition_id, partition.num_rows + ))); + } + + let reader = self.open_file_reader(path).await?; + let ranges = partition + .ranges + .iter() + .map(|range| Range { + start: range.offset, + end: range.offset + range.num_rows, + }) + .collect::>(); + let schema = Arc::new(reader.schema().as_ref().into()); + Ok(Some(Box::new(RecordBatchStreamAdapter::new( + schema, + reader.read_stream( + ReadBatchParams::Ranges(ranges.into()), + u32::MAX, + 16, + FilterExpression::no_filter(), + )?, + )))) + } + + fn 
partition_size(&self, partition_id: usize) -> Result { + Ok(self + .partitions + .get(partition_id) + .map(|partition| partition.num_rows) + .unwrap_or(0)) + } + + fn total_loss(&self) -> Option { + self.total_loss + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use arrow_array::cast::AsArray; + use arrow_array::{FixedSizeListArray, RecordBatch, UInt8Array, UInt64Array}; + use futures::TryStreamExt; + use lance_arrow::FixedSizeListArrayExt; + use lance_core::ROW_ID; + use lance_core::datatypes::Schema; + use lance_file::writer::{FileWriter, FileWriterOptions}; + use lance_io::object_store::ObjectStore; + + use crate::Error; + + use super::*; + + #[tokio::test] + async fn partition_artifact_builder_compacts_runs_into_single_partition_range() { + let tempdir = tempfile::tempdir().unwrap(); + let root_dir = tempdir.path().join("artifact"); + fs::create_dir_all(&root_dir).unwrap(); + let object_store = Arc::new(ObjectStore::local()); + let root_path = Path::from_filesystem_path(&root_dir).unwrap(); + + let mut builder = PartitionArtifactBuilder::try_new_with_store( + object_store.clone(), + root_path.clone(), + 300, + 2, + ) + .unwrap(); + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new(ROW_ID, DataType::UInt64, false), + Field::new(PART_ID_COLUMN, DataType::UInt32, false), + Field::new( + PQ_CODE_COLUMN, + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::UInt8, true)), 2), + true, + ), + ])); + + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![10_u64, 11, 12, 13])), + Arc::new(UInt32Array::from(vec![0_u32, 256, 0, 256])), + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8]), + 2, + ) + .unwrap(), + ), + ], + ) + .unwrap(); + let batch2 = RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![14_u64, 15])), + Arc::new(UInt32Array::from(vec![1_u32, 256])), + Arc::new( + FixedSizeListArray::try_new_from_values( + 
UInt8Array::from(vec![9, 10, 11, 12]), + 2, + ) + .unwrap(), + ), + ], + ) + .unwrap(); + builder.append_batch(&batch1).await.unwrap(); + builder.append_batch(&batch2).await.unwrap(); + let artifact_files = builder.finish("metadata.lance", Some(2.5)).await.unwrap(); + assert_eq!(artifact_files[0], "manifest.json"); + assert!( + artifact_files + .iter() + .any(|path| path.ends_with("bucket-00000.lance")) + ); + + let manifest: PartitionArtifactManifest = + serde_json::from_slice(&fs::read(root_dir.join("manifest.json")).unwrap()).unwrap(); + assert_eq!(manifest.version, 1); + assert_eq!(manifest.metadata_file.as_deref(), Some("metadata.lance")); + assert_eq!(manifest.total_loss, Some(2.5)); + assert_eq!(manifest.partitions[0].num_rows, 2); + assert_eq!(manifest.partitions[0].ranges.len(), 1); + assert_eq!(manifest.partitions[1].num_rows, 1); + assert_eq!(manifest.partitions[1].ranges.len(), 1); + assert_eq!(manifest.partitions[256].num_rows, 3); + assert_eq!(manifest.partitions[256].ranges.len(), 1); + assert_eq!( + manifest.partitions[0].path, manifest.partitions[256].path, + "partitions sharing a bucket should share one final file" + ); + + let reader = PartitionArtifactShuffleReader::try_open_with_store(object_store, root_path) + .await + .unwrap(); + let partition_0 = reader + .read_partition(0) + .await + .unwrap() + .unwrap() + .try_collect::>() + .await + .unwrap(); + let partition_0_row_ids = partition_0 + .iter() + .flat_map(|batch| { + batch[ROW_ID] + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect::>(); + assert_eq!(partition_0_row_ids, vec![10, 12]); + + let partition_256 = reader + .read_partition(256) + .await + .unwrap() + .unwrap() + .try_collect::>() + .await + .unwrap(); + let partition_256_row_ids = partition_256 + .iter() + .flat_map(|batch| { + batch[ROW_ID] + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect::>(); + assert_eq!(partition_256_row_ids, vec![11, 13, 15]); + } + + #[tokio::test] + async fn 
partition_artifact_reader_reads_partition_ranges() { + let tempdir = tempfile::tempdir().unwrap(); + let root_dir = tempdir.path().join("artifact"); + fs::create_dir_all(root_dir.join("partitions")).unwrap(); + + let object_store = Arc::new(ObjectStore::local()); + let root_path = Path::from_filesystem_path(&root_dir).unwrap(); + let partition_path = root_path.child("partitions").child("bucket-00000.lance"); + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new(ROW_ID, arrow_schema::DataType::UInt64, false), + arrow_schema::Field::new( + lance_index::vector::PQ_CODE_COLUMN, + arrow_schema::DataType::FixedSizeList( + Arc::new(arrow_schema::Field::new( + "item", + arrow_schema::DataType::UInt8, + true, + )), + 2, + ), + true, + ), + ])); + let mut writer = FileWriter::try_new( + object_store.create(&partition_path).await.unwrap(), + Schema::try_from(schema.as_ref()).unwrap(), + FileWriterOptions::default(), + ) + .unwrap(); + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![10_u64, 11, 12])), + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(vec![1, 2, 3, 4, 5, 6]), + 2, + ) + .unwrap(), + ), + ], + ) + .unwrap(); + let batch2 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![13_u64, 14])), + Arc::new( + FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![7, 8, 9, 10]), 2) + .unwrap(), + ), + ], + ) + .unwrap(); + writer.write_batch(&batch1).await.unwrap(); + writer.write_batch(&batch2).await.unwrap(); + writer.finish().await.unwrap(); + + let manifest = serde_json::json!({ + "version": 1, + "num_partitions": 3, + "total_loss": 1.5, + "partitions": [ + { + "path": "partitions/bucket-00000.lance", + "num_rows": 2, + "ranges": [ + {"offset": 0, "num_rows": 1}, + {"offset": 3, "num_rows": 1}, + ], + }, + { + "path": "partitions/bucket-00000.lance", + "num_rows": 2, + "ranges": [ + {"offset": 1, "num_rows": 2}, + ], + }, + { + 
"num_rows": 0, + "ranges": [], + }, + ], + }); + fs::write( + root_dir.join("manifest.json"), + serde_json::to_vec(&manifest).unwrap(), + ) + .unwrap(); + + let reader = PartitionArtifactShuffleReader::try_open_with_store(object_store, root_path) + .await + .unwrap(); + assert_eq!(reader.partition_size(0).unwrap(), 2); + assert_eq!(reader.partition_size(1).unwrap(), 2); + assert_eq!(reader.partition_size(2).unwrap(), 0); + assert_eq!(reader.total_loss(), Some(1.5)); + + let stream = reader.read_partition(0).await.unwrap().unwrap(); + let batches = stream.try_collect::>().await.unwrap(); + let row_ids = batches + .iter() + .flat_map(|batch| { + batch[ROW_ID] + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect::>(); + assert_eq!(row_ids, vec![10, 13]); + assert!(reader.read_partition(2).await.unwrap().is_none()); + } + + #[tokio::test] + async fn partition_artifact_reader_rejects_missing_partition_entry() { + let tempdir = tempfile::tempdir().unwrap(); + let root_dir = tempdir.path().join("artifact"); + fs::create_dir_all(&root_dir).unwrap(); + let manifest = serde_json::json!({ + "version": 1, + "num_partitions": 2, + "partitions": [{"num_rows": 0, "ranges": []}], + }); + fs::write( + root_dir.join("manifest.json"), + serde_json::to_vec(&manifest).unwrap(), + ) + .unwrap(); + + let error = PartitionArtifactShuffleReader::try_open_with_store( + Arc::new(ObjectStore::local()), + Path::from_filesystem_path(&root_dir).unwrap(), + ) + .await + .unwrap_err(); + assert!(matches!(error, Error::InvalidInput { .. })); + } +} diff --git a/rust/lance/src/index/vector/utils.rs b/rust/lance/src/index/vector/utils.rs index 19156ac8eed..244a02c39bc 100644 --- a/rust/lance/src/index/vector/utils.rs +++ b/rust/lance/src/index/vector/utils.rs @@ -372,7 +372,7 @@ impl PartitionLoadLock { /// /// Handles both regular vector columns (FixedSizeList) and multivector columns /// (List\), flattening the latter. 
-fn vector_column_to_fsl(batch: &RecordBatch, column: &str) -> Result { +pub fn vector_column_to_fsl(batch: &RecordBatch, column: &str) -> Result { let array = get_column_from_batch(batch, column)?; match array.data_type() { arrow::datatypes::DataType::FixedSizeList(_, _) => Ok(array.as_fixed_size_list().clone()), From c0af4918b16cd5584e305cc0b90b74a723d7d9bf Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 7 Apr 2026 17:20:13 +0800 Subject: [PATCH 15/21] refactor: decouple cuvs backend from main tree --- Cargo.toml | 2 +- python/Cargo.lock | 112 +- python/Cargo.toml | 2 - python/python/lance/cuvs.py | 93 +- python/python/lance/dataset.py | 39 +- python/python/tests/test_vector_index.py | 165 +-- python/src/indices.rs | 91 -- rust/lance-cuvs/Cargo.toml | 28 - rust/lance-cuvs/src/lib.rs | 1237 ---------------------- 9 files changed, 67 insertions(+), 1702 deletions(-) delete mode 100644 rust/lance-cuvs/Cargo.toml delete mode 100644 rust/lance-cuvs/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index bddb49ed4a3..c922eff6b8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ "rust/compression/bitpacking", "rust/arrow-scalar", ] -exclude = ["python", "java/lance-jni", "rust/lance-cuvs"] +exclude = ["python", "java/lance-jni"] # Python package needs to be built by maturin. 
resolver = "3" diff --git a/python/Cargo.lock b/python/Cargo.lock index aa4cfb72154..4507a617872 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -14,7 +14,7 @@ dependencies = [ "core_extensions", "crossbeam-channel", "generational-arena", - "libloading 0.7.4", + "libloading", "lock_api", "parking_lot", "paste", @@ -1070,26 +1070,6 @@ dependencies = [ "virtue", ] -[[package]] -name = "bindgen" -version = "0.72.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" -dependencies = [ - "bitflags 2.11.0", - "cexpr", - "clang-sys", - "itertools 0.13.0", - "log", - "prettyplease", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 2.0.117", -] - [[package]] name = "bitflags" version = "1.3.2" @@ -1289,15 +1269,6 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" -[[package]] -name = "cexpr" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" -dependencies = [ - "nom 7.1.3", -] - [[package]] name = "cfg-if" version = "1.0.4" @@ -1344,17 +1315,6 @@ dependencies = [ "inout", ] -[[package]] -name = "clang-sys" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" -dependencies = [ - "glob", - "libc", - "libloading 0.8.9", -] - [[package]] name = "cmake" version = "0.1.58" @@ -1617,26 +1577,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "cuvs" -version = "26.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9778fa1e16f42539772496e9adba2a29c67dca84bcb0d247795f9cb3135ba87d" -dependencies = [ - "cuvs-sys", - "ndarray 0.15.6", -] - -[[package]] -name = "cuvs-sys" -version = "26.2.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4cad121da7a7ac908965352ffeac029a93fb0e3a1278a271f7204098b8724e9" -dependencies = [ - "bindgen", - "cmake", -] - [[package]] name = "darling" version = "0.20.11" @@ -4045,30 +3985,6 @@ dependencies = [ "url", ] -[[package]] -name = "lance-cuvs" -version = "5.0.0-beta.2" -dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", - "cuvs", - "cuvs-sys", - "futures", - "half", - "lance", - "lance-arrow", - "lance-core", - "lance-file", - "lance-index", - "lance-io", - "lance-linalg", - "log", - "ndarray 0.16.1", - "tokio", -] - [[package]] name = "lance-datafusion" version = "5.0.0-beta.2" @@ -4251,7 +4167,7 @@ dependencies = [ "lindera", "lindera-tantivy", "log", - "ndarray 0.16.1", + "ndarray", "num-traits", "object_store", "prost", @@ -4552,16 +4468,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "libloading" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" -dependencies = [ - "cfg-if", - "windows-link", -] - [[package]] name = "liblzma" version = "0.4.6" @@ -4988,19 +4894,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" -[[package]] -name = "ndarray" -version = "0.15.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" -dependencies = [ - "matrixmultiply", - "num-complex", - "num-integer", - "num-traits", - "rawpointer", -] - [[package]] name = "ndarray" version = "0.16.1" @@ -5796,7 +5689,6 @@ dependencies = [ "lance", "lance-arrow", "lance-core", - "lance-cuvs", "lance-datafusion", "lance-datagen", "lance-encoding", diff --git a/python/Cargo.toml b/python/Cargo.toml index d63a8e113d4..a3542f7360f 100644 --- a/python/Cargo.toml +++ 
b/python/Cargo.toml @@ -35,7 +35,6 @@ lance = { path = "../rust/lance", features = [ ] } lance-arrow = { path = "../rust/lance-arrow" } lance-core = { path = "../rust/lance-core" } -lance-cuvs = { path = "../rust/lance-cuvs", optional = true } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } @@ -76,7 +75,6 @@ bytes = "1.4" [features] default = [] -cuvs = ["dep:lance-cuvs"] datagen = ["lance-datagen"] fp16kernels = ["lance/fp16kernels"] diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index b36fd18c564..c32dc12b55c 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -38,28 +38,49 @@ DEFAULT_PARTITION_ARTIFACT_BUCKETS = 256 PARTITION_ARTIFACT_ROW_ID_COLUMN = "_rowid" -try: - from . import lance as _lance_ext +def build_vector_index_on_cuvs( + dataset, + column: str, + metric_type: str, + accelerator: str, + num_partitions: int, + num_sub_vectors: int, + dst_dataset_uri: str | Path | None = None, + *, + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + batch_size: int = 1024 * 128, + filter_nan: bool = True, +): + if dst_dataset_uri is None: + dst_dataset_uri = tempfile.mkdtemp() - _assign_ivf_pq_on_cuvs_rust_impl = getattr( - _lance_ext.indices, "_assign_ivf_pq_on_cuvs_rust" - ) - _train_ivf_pq_on_cuvs_rust_impl = getattr( - _lance_ext.indices, "_train_ivf_pq_on_cuvs_rust" + trained_index, ivf_centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( + dataset, + column, + num_partitions, + metric_type, + accelerator, + num_sub_vectors=num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, ) -except (ImportError, AttributeError): - _assign_ivf_pq_on_cuvs_rust_impl = None - _train_ivf_pq_on_cuvs_rust_impl = None - - -def _has_rust_cuvs_backend() -> bool: - return ( - _train_ivf_pq_on_cuvs_rust_impl is not None - and 
_assign_ivf_pq_on_cuvs_rust_impl is not None + artifact_root, artifact_files = one_pass_assign_ivf_pq_on_cuvs( + dataset, + column, + metric_type, + accelerator, + ivf_centroids, + pq_codebook, + trained_index=trained_index, + dst_dataset_uri=dst_dataset_uri, + batch_size=batch_size, + filter_nan=filter_nan, ) - -def _unwrap_dataset(dataset): - return getattr(dataset, "_ds", dataset) + return artifact_root, artifact_files, ivf_centroids, pq_codebook def is_cuvs_accelerator(accelerator: object) -> bool: @@ -374,19 +395,6 @@ def _train_ivf_pq_index_on_cuvs( num_bits: int = 8, filter_nan: bool = True, ): - if _has_rust_cuvs_backend(): - return _train_ivf_pq_on_cuvs_rust_impl( - _unwrap_dataset(dataset), - column, - num_partitions, - metric_type, - num_sub_vectors, - sample_rate=sample_rate, - max_iters=max_iters, - num_bits=num_bits, - filter_nan=filter_nan, - ) - if accelerator != "cuvs": raise ValueError("cuVS acceleration only supports accelerator='cuvs'") if num_bits != 8: @@ -446,27 +454,6 @@ def one_pass_assign_ivf_pq_on_cuvs( *, filter_nan: bool = True, ): - if _has_rust_cuvs_backend(): - if accelerator != "cuvs": - raise ValueError("cuVS acceleration only supports accelerator='cuvs'") - if trained_index is None: - raise ValueError( - "one_pass_assign_ivf_pq_on_cuvs requires a trained cuVS index for " - "single-node transform" - ) - if dst_dataset_uri is None: - dst_dataset_uri = tempfile.mkdtemp() - artifact_files = _assign_ivf_pq_on_cuvs_rust_impl( - _unwrap_dataset(dataset), - column, - trained_index, - str(dst_dataset_uri), - batch_size=batch_size, - filter_nan=filter_nan, - ) - LOGGER.info("Saved precomputed partition artifact to %s", dst_dataset_uri) - return str(dst_dataset_uri), artifact_files - if accelerator != "cuvs": raise ValueError("cuVS acceleration only supports accelerator='cuvs'") diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 415ffdb8865..ee023048a1e 100644 --- a/python/python/lance/dataset.py +++ 
b/python/python/lance/dataset.py @@ -2960,48 +2960,29 @@ def _create_index_impl( ) if use_cuvs: - from .cuvs import ( - _train_ivf_pq_index_on_cuvs, - one_pass_assign_ivf_pq_on_cuvs, - ) + from .cuvs import build_vector_index_on_cuvs - LOGGER.info("Doing one-pass ivfpq cuVS training") - timers["ivf+pq_train:start"] = time.time() - trained_index, ivf_centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( + LOGGER.info("Doing cuVS vector backend build") + timers["ivf+pq_build:start"] = time.time() + artifact_root, _, ivf_centroids, pq_codebook = build_vector_index_on_cuvs( self, column[0], - num_partitions, metric, accelerator, - num_sub_vectors=num_sub_vectors, + num_partitions, + num_sub_vectors, sample_rate=kwargs.get("sample_rate", 256), max_iters=kwargs.get("max_iters", 50), num_bits=kwargs.get("num_bits", 8), - filter_nan=filter_nan, - ) - timers["ivf+pq_train:end"] = time.time() - ivfpq_train_time = ( - timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] - ) - LOGGER.info("cuVS ivf+pq training time: %ss", ivfpq_train_time) - timers["ivf+pq_assign:start"] = time.time() - artifact_root, _ = one_pass_assign_ivf_pq_on_cuvs( - self, - column[0], - metric, - accelerator, - ivf_centroids, - pq_codebook, - trained_index=trained_index, batch_size=1024 * 128, filter_nan=filter_nan, ) kwargs["precomputed_partition_artifact_uri"] = artifact_root - timers["ivf+pq_assign:end"] = time.time() - ivfpq_assign_time = ( - timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] + timers["ivf+pq_build:end"] = time.time() + ivfpq_build_time = ( + timers["ivf+pq_build:end"] - timers["ivf+pq_build:start"] ) - LOGGER.info("cuVS ivf+pq transform time: %ss", ivfpq_assign_time) + LOGGER.info("cuVS ivf+pq build time: %ss", ivfpq_build_time) else: from .vector import ( one_pass_assign_ivf_pq_on_accelerator, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index c2b42de2ac1..9aba519fa9c 100644 --- 
a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -30,8 +30,7 @@ def _disable_rust_cuvs_backend(monkeypatch): - monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", None) - monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", None) + del monkeypatch def create_table(nvec=1000, ndim=128, nans=0, nullify=False, dtype=np.float32): @@ -549,22 +548,14 @@ def test_create_index_cuvs_dispatch(tmp_path, monkeypatch): dataset = lance.write_dataset(tbl, tmp_path) calls = {} - class FakeIndex: - pq_dim = 16 - pq_bits = 8 - - def fake_train( + def fake_build( dataset_arg, column, - num_partitions, metric_type, accelerator, + num_partitions, num_sub_vectors, - *, - sample_rate, - max_iters, - num_bits, - filter_nan, + **kwargs, ): calls["dataset"] = dataset_arg calls["column"] = column @@ -572,52 +563,16 @@ def fake_train( calls["metric_type"] = metric_type calls["accelerator"] = accelerator calls["num_sub_vectors"] = num_sub_vectors - calls["sample_rate"] = sample_rate - calls["max_iters"] = max_iters - calls["num_bits"] = num_bits - calls["filter_nan"] = filter_nan - return ( - FakeIndex(), - np.random.randn(num_partitions, 128).astype(np.float32), - np.random.randn(num_sub_vectors, 256, 128 // num_sub_vectors).astype( - np.float32 - ), - ) - - def fake_assign( - dataset_arg, - column, - metric_type, - accelerator, - ivf_centroids, - pq_codebook, - trained_index, - dst_path=None, - batch_size=20480, - *, - filter_nan, - ): - calls["assign_dataset"] = dataset_arg - calls["assign_column"] = column - calls["assign_metric_type"] = metric_type - calls["assign_accelerator"] = accelerator - calls["assign_ivf_centroids"] = ivf_centroids - calls["assign_pq_codebook"] = pq_codebook - calls["assign_trained_index"] = trained_index - calls["assign_batch_size"] = batch_size - calls["assign_filter_nan"] = filter_nan + calls["kwargs"] = kwargs return str(tmp_path / "cuvs_artifact"), [ "manifest.json", "metadata.lance", 
"partitions/bucket-00000.lance", - ] + ], np.random.randn(num_partitions, 128).astype(np.float32), np.random.randn( + num_sub_vectors, 256, 128 // num_sub_vectors + ).astype(np.float32) - monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_index_on_cuvs", fake_train) - monkeypatch.setattr( - lance_cuvs, - "one_pass_assign_ivf_pq_on_cuvs", - fake_assign, - ) + monkeypatch.setattr(lance_cuvs, "build_vector_index_on_cuvs", fake_build) dataset = dataset.create_index( "vector", @@ -632,10 +587,11 @@ def fake_assign( assert calls["metric_type"] == "L2" assert calls["accelerator"] == "cuvs" assert calls["num_sub_vectors"] == 16 - assert calls["assign_column"] == "vector" - assert calls["assign_metric_type"] == "L2" - assert calls["assign_accelerator"] == "cuvs" - assert isinstance(calls["assign_trained_index"], FakeIndex) + assert calls["kwargs"]["sample_rate"] == 256 + assert calls["kwargs"]["max_iters"] == 50 + assert calls["kwargs"]["num_bits"] == 8 + assert calls["kwargs"]["batch_size"] == 1024 * 128 + assert calls["kwargs"]["filter_nan"] is True assert dataset.stats.index_stats("vector_idx")["index_type"] == "IVF_PQ" @@ -741,55 +697,6 @@ def build(build_params, matrix): assert pq_codebook.shape == (4, 256, 4) -def test_train_ivf_pq_on_cuvs_prefers_rust_backend(tmp_path, monkeypatch): - dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) - calls = {} - - class FakeRustIndex: - pass - - def fake_train(*args, **kwargs): - calls["args"] = args - calls["kwargs"] = kwargs - return ( - FakeRustIndex(), - pa.FixedSizeListArray.from_arrays( - pa.array(np.arange(64, dtype=np.float32)), 16 - ), - pa.FixedSizeListArray.from_arrays( - pa.array(np.arange(4 * 256 * 4, dtype=np.float32)), 4 - ), - ) - - monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", fake_train) - monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", object()) - monkeypatch.setattr( - lance_cuvs, - "_require_cuvs", - lambda: (_ for _ in ()).throw(AssertionError("python 
cuVS backend should not run")), - ) - - trained_index, centroids, pq_codebook = lance_cuvs._train_ivf_pq_index_on_cuvs( - dataset, - "vector", - 4, - "l2", - "cuvs", - 4, - sample_rate=8, - max_iters=30, - num_bits=8, - filter_nan=True, - ) - - assert isinstance(trained_index, FakeRustIndex) - assert calls["args"][:5] == (dataset, "vector", 4, "l2", 4) - assert calls["kwargs"]["sample_rate"] == 8 - assert calls["kwargs"]["max_iters"] == 30 - assert isinstance(centroids, pa.FixedSizeListArray) - assert isinstance(pq_codebook, pa.FixedSizeListArray) - - def test_train_ivf_pq_on_cuvs_uses_num_sub_vectors_for_pq_dim( tmp_path, monkeypatch ): @@ -958,50 +865,6 @@ def transform(index, vectors): assert bucket_table.column("__pq_code").type == pa.list_(pa.uint8(), 4) -def test_one_pass_assign_ivf_pq_on_cuvs_prefers_rust_backend(tmp_path, monkeypatch): - dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path / "cuvs_assign_rust") - calls = {} - - class FakeRustIndex: - pass - - def fake_assign(*args, **kwargs): - calls["args"] = args - calls["kwargs"] = kwargs - return ["manifest.json", "metadata.lance", "partitions/bucket-00000.lance"] - - monkeypatch.setattr(lance_cuvs, "_train_ivf_pq_on_cuvs_rust_impl", object()) - monkeypatch.setattr(lance_cuvs, "_assign_ivf_pq_on_cuvs_rust_impl", fake_assign) - monkeypatch.setattr( - lance_cuvs, - "_require_cuvs", - lambda: (_ for _ in ()).throw(AssertionError("python cuVS backend should not run")), - ) - - artifact_root, artifact_files = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( - dataset, - "vector", - "l2", - "cuvs", - np.random.randn(4, 16).astype(np.float32), - np.random.randn(4, 256, 4).astype(np.float32), - trained_index=FakeRustIndex(), - dst_dataset_uri=tmp_path / "artifact", - batch_size=4096, - ) - - assert artifact_root == str(tmp_path / "artifact") - assert artifact_files[0] == "manifest.json" - assert calls["args"][:4] == ( - dataset, - "vector", - calls["args"][2], - str(tmp_path / "artifact"), - ) - 
assert isinstance(calls["args"][2], FakeRustIndex) - assert calls["kwargs"]["batch_size"] == 4096 - - def test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( tmp_path, monkeypatch, diff --git a/python/src/indices.rs b/python/src/indices.rs index cb8288b51a5..cea7f2a968a 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -32,8 +32,6 @@ use pyo3::{ }; use lance::index::DatasetIndexInternalExt; -#[cfg(feature = "cuvs")] -use lance_cuvs::TrainedIvfPqIndex; use crate::fragment::FileFragment; use crate::utils::{PyJson, PyLance}; @@ -157,89 +155,6 @@ impl PyIvfModel { } } -#[cfg(feature = "cuvs")] -#[pyclass(name = "_CuvsIvfPqIndex", module = "lance.indices", unsendable)] -pub struct PyCuvsIvfPqIndex { - inner: TrainedIvfPqIndex, -} - -#[cfg(feature = "cuvs")] -#[pyfunction] -#[allow(clippy::too_many_arguments)] -#[pyo3( - signature=( - dataset, - column, - num_partitions, - distance_type, - num_sub_vectors, - sample_rate=256, - max_iters=50, - num_bits=8, - filter_nan=true - ) -)] -fn _train_ivf_pq_on_cuvs_rust<'py>( - py: Python<'py>, - dataset: &Dataset, - column: &str, - num_partitions: u32, - distance_type: &str, - num_sub_vectors: u32, - sample_rate: u32, - max_iters: u32, - num_bits: u8, - filter_nan: bool, -) -> PyResult<(Py, Bound<'py, PyAny>, Bound<'py, PyAny>)> { - let distance_type = DistanceType::try_from(distance_type).unwrap(); - let trained = rt() - .runtime - .block_on(lance_cuvs::train_ivf_pq( - dataset.ds.as_ref(), - column, - num_partitions as usize, - distance_type, - num_sub_vectors as usize, - sample_rate as usize, - max_iters as usize, - num_bits as usize, - filter_nan, - )) - .infer_error()?; - let ivf_centroids = trained.ivf_centroids().clone().into_data().to_pyarrow(py)?; - let pq_codebook = trained.pq_codebook().clone().into_data().to_pyarrow(py)?; - Ok(( - Py::new(py, PyCuvsIvfPqIndex { inner: trained })?, - ivf_centroids, - pq_codebook, - )) -} - -#[cfg(feature = "cuvs")] -#[pyfunction] 
-#[pyo3(signature=(dataset, column, trained_index, artifact_root, batch_size=1024 * 128, filter_nan=true))] -fn _assign_ivf_pq_on_cuvs_rust( - py: Python<'_>, - dataset: &Dataset, - column: &str, - trained_index: &PyCuvsIvfPqIndex, - artifact_root: &str, - batch_size: usize, - filter_nan: bool, -) -> PyResult> { - let _ = py; - rt().runtime - .block_on(lance_cuvs::assign_ivf_pq_to_artifact( - dataset.ds.as_ref(), - column, - &trained_index.inner, - artifact_root, - batch_size, - filter_nan, - )) - .infer_error() -} - /// Internal helper to fetch an IVF model for the given index name. async fn do_get_ivf_model(dataset: &Dataset, index_name: &str) -> PyResult { use lance_index::metrics::NoOpMetricsCollector; @@ -801,12 +716,6 @@ pub fn register_indices(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { indices.add_class::()?; indices.add_class::()?; indices.add_wrapped(wrap_pyfunction!(get_ivf_model))?; - #[cfg(feature = "cuvs")] - { - indices.add_class::()?; - indices.add_wrapped(wrap_pyfunction!(_train_ivf_pq_on_cuvs_rust))?; - indices.add_wrapped(wrap_pyfunction!(_assign_ivf_pq_on_cuvs_rust))?; - } m.add_submodule(&indices)?; Ok(()) } diff --git a/rust/lance-cuvs/Cargo.toml b/rust/lance-cuvs/Cargo.toml deleted file mode 100644 index a001f82c16f..00000000000 --- a/rust/lance-cuvs/Cargo.toml +++ /dev/null @@ -1,28 +0,0 @@ -[package] -name = "lance-cuvs" -version = "5.0.0-beta.2" -edition = "2024" -authors = ["Lance Devs "] -license = "Apache-2.0" -rust-version = "1.91" -publish = false - -[dependencies] -arrow = "57.0.0" -arrow-array = "57.0.0" -arrow-buffer = "57.0.0" -arrow-schema = "57.0.0" -cuvs = "26.2.0" -cuvs-sys = "26.2.0" -futures = "0.3" -half = { version = "2.5", default-features = false, features = ["num-traits", "std"] } -lance = { path = "../lance" } -lance-arrow = { path = "../lance-arrow" } -lance-core = { path = "../lance-core" } -lance-file = { path = "../lance-file" } -lance-index = { path = "../lance-index" } -lance-io = { path = 
"../lance-io" } -lance-linalg = { path = "../lance-linalg" } -log = "0.4" -ndarray = { version = "0.16.1", features = ["matrixmultiply-threading"] } -tokio = { version = "1.48", features = ["rt-multi-thread"] } diff --git a/rust/lance-cuvs/src/lib.rs b/rust/lance-cuvs/src/lib.rs deleted file mode 100644 index db54ce47f22..00000000000 --- a/rust/lance-cuvs/src/lib.rs +++ /dev/null @@ -1,1237 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -use std::ffi::{CStr, c_void}; -use std::marker::PhantomData; -use std::ptr; -use std::sync::Arc; -use arrow::compute::filter; -use arrow_array::cast::AsArray; -use arrow_array::types::{Float16Type, Float32Type, Float64Type, UInt8Type}; -use arrow_array::{ - Array, FixedSizeListArray, Float32Array, ListArray, RecordBatch, UInt8Array, UInt32Array, - UInt64Array, -}; -use arrow_buffer::{OffsetBuffer, ScalarBuffer}; -use arrow_schema::{DataType, Field, Schema as ArrowSchema}; -use cuvs::Resources; -use futures::TryStreamExt; -use lance::dataset::Dataset; -use lance::index::vector::PartitionArtifactBuilder; -use lance::index::vector::utils::{infer_vector_dim, vector_column_to_fsl}; -use lance_arrow::{FixedSizeListArrayExt, RecordBatchExt}; -use lance_core::{Error, ROW_ID, Result}; -use lance_file::version::LanceFileVersion; -use lance_file::writer::{FileWriter, FileWriterOptions}; -use lance_index::vector::utils::is_finite; -use lance_index::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN, PQ_CODE_COLUMN}; -use lance_linalg::distance::DistanceType; -use log::warn; -use ndarray::{Array2, ArrayView2}; - -const PARTITION_ARTIFACT_METADATA_FILE_NAME: &str = "metadata.lance"; -const PARTITION_ARTIFACT_FILE_VERSION: &str = "2.2"; -const PIPELINE_SLOTS: usize = 2; - -type CudaEventHandle = *mut c_void; - -#[link(name = "cudart")] -unsafe extern "C" { - fn cudaMallocHost(ptr: *mut *mut c_void, size: usize) -> cuvs_sys::cudaError_t; - fn cudaFreeHost(ptr: *mut c_void) -> 
cuvs_sys::cudaError_t; - fn cudaEventCreate(event: *mut CudaEventHandle) -> cuvs_sys::cudaError_t; - fn cudaEventDestroy(event: CudaEventHandle) -> cuvs_sys::cudaError_t; - fn cudaEventRecord( - event: CudaEventHandle, - stream: cuvs_sys::cudaStream_t, - ) -> cuvs_sys::cudaError_t; - fn cudaEventSynchronize(event: CudaEventHandle) -> cuvs_sys::cudaError_t; -} - -pub struct TrainedIvfPqIndex { - resources: Resources, - index: CuvsIvfPqIndex, - num_partitions: usize, - dimension: usize, - num_sub_vectors: usize, - num_bits: usize, - metric_type: DistanceType, - ivf_centroids: FixedSizeListArray, - pq_codebook: FixedSizeListArray, -} - -impl TrainedIvfPqIndex { - pub fn ivf_centroids(&self) -> &FixedSizeListArray { - &self.ivf_centroids - } - - pub fn pq_codebook(&self) -> &FixedSizeListArray { - &self.pq_codebook - } - - pub fn num_partitions(&self) -> usize { - self.num_partitions - } - - pub fn pq_code_width(&self) -> usize { - self.num_sub_vectors - } - - pub fn metric_type(&self) -> DistanceType { - self.metric_type - } - - pub fn num_bits(&self) -> usize { - self.num_bits - } -} - -struct CuvsIvfPqIndex { - raw: cuvs_sys::cuvsIvfPqIndex_t, -} - -impl CuvsIvfPqIndex { - fn try_new() -> Result { - let mut raw = ptr::null_mut(); - check_cuvs( - unsafe { cuvs_sys::cuvsIvfPqIndexCreate(&mut raw) }, - "create IVF_PQ index", - )?; - Ok(Self { raw }) - } -} - -impl Drop for CuvsIvfPqIndex { - fn drop(&mut self) { - if !self.raw.is_null() { - let _ = unsafe { cuvs_sys::cuvsIvfPqIndexDestroy(self.raw) }; - } - } -} - -enum MatrixBuffer<'a> { - Borrowed { - values: &'a [f32], - rows: usize, - cols: usize, - }, - Owned(Array2), -} - -impl MatrixBuffer<'_> { - fn view(&self) -> Result> { - match self { - Self::Borrowed { values, rows, cols } => ArrayView2::from_shape((*rows, *cols), values) - .map_err(|error| { - Error::io(format!("failed to create borrowed matrix view: {error}")) - }), - Self::Owned(array) => Ok(array.view()), - } - } - - fn rows(&self) -> usize { - match 
self { - Self::Borrowed { rows, .. } => *rows, - Self::Owned(array) => array.nrows(), - } - } -} - -struct HostTensorView { - shape: Vec, - tensor: cuvs_sys::DLManagedTensor, -} - -impl HostTensorView { - fn try_new(shape: &[usize], data: *mut std::ffi::c_void) -> Self { - let shape = shape.iter().map(|dim| *dim as i64).collect::>(); - let tensor = cuvs_sys::DLManagedTensor { - dl_tensor: cuvs_sys::DLTensor { - data, - device: cuvs_sys::DLDevice { - device_type: cuvs_sys::DLDeviceType::kDLCPU, - device_id: 0, - }, - ndim: shape.len() as i32, - dtype: T::dl_dtype(), - shape: shape.as_ptr() as *mut i64, - strides: ptr::null_mut(), - byte_offset: 0, - }, - manager_ctx: ptr::null_mut(), - deleter: None, - }; - Self { shape, tensor } - } - - fn as_mut_ptr(&mut self) -> *mut cuvs_sys::DLManagedTensor { - debug_assert_eq!(self.shape.len(), self.tensor.dl_tensor.ndim as usize); - &mut self.tensor - } -} - -trait DlElement: Copy + Default { - fn dl_dtype() -> cuvs_sys::DLDataType; -} - -impl DlElement for f32 { - fn dl_dtype() -> cuvs_sys::DLDataType { - cuvs_sys::DLDataType { - code: cuvs_sys::DLDataTypeCode::kDLFloat as u8, - bits: 32, - lanes: 1, - } - } -} - -impl DlElement for u8 { - fn dl_dtype() -> cuvs_sys::DLDataType { - cuvs_sys::DLDataType { - code: cuvs_sys::DLDataTypeCode::kDLUInt as u8, - bits: 8, - lanes: 1, - } - } -} - -impl DlElement for u32 { - fn dl_dtype() -> cuvs_sys::DLDataType { - cuvs_sys::DLDataType { - code: cuvs_sys::DLDataTypeCode::kDLUInt as u8, - bits: 32, - lanes: 1, - } - } -} - -struct DeviceTensor { - shape: Vec, - tensor: cuvs_sys::DLManagedTensor, - capacity_bytes: usize, - resources: cuvs_sys::cuvsResources_t, - _marker: PhantomData, -} - -impl DeviceTensor { - fn try_new(resources: &Resources, shape: &[usize]) -> Result { - let capacity_bytes = shape.iter().product::() * std::mem::size_of::(); - let mut data = ptr::null_mut(); - check_cuvs( - unsafe { cuvs_sys::cuvsRMMAlloc(resources.0, &mut data, capacity_bytes) }, - "allocate device 
tensor", - )?; - let shape = shape.iter().map(|dim| *dim as i64).collect::>(); - let tensor = cuvs_sys::DLManagedTensor { - dl_tensor: cuvs_sys::DLTensor { - data, - device: cuvs_sys::DLDevice { - device_type: cuvs_sys::DLDeviceType::kDLCUDA, - device_id: 0, - }, - ndim: shape.len() as i32, - dtype: T::dl_dtype(), - shape: shape.as_ptr() as *mut i64, - strides: ptr::null_mut(), - byte_offset: 0, - }, - manager_ctx: ptr::null_mut(), - deleter: None, - }; - Ok(Self { - shape, - tensor, - capacity_bytes, - resources: resources.0, - _marker: PhantomData, - }) - } - - fn as_mut_ptr(&mut self) -> *mut cuvs_sys::DLManagedTensor { - debug_assert_eq!(self.shape.len(), self.tensor.dl_tensor.ndim as usize); - &mut self.tensor - } - - fn set_shape(&mut self, shape: &[usize]) -> Result<()> { - if shape.len() != self.shape.len() { - return Err(Error::io(format!( - "device tensor rank mismatch: expected {}, got {}", - self.shape.len(), - shape.len() - ))); - } - let required_bytes = shape.iter().product::() * std::mem::size_of::(); - if required_bytes > self.capacity_bytes { - return Err(Error::io(format!( - "device tensor capacity {} bytes is smaller than requested shape {:?} ({} bytes)", - self.capacity_bytes, shape, required_bytes - ))); - } - for (dst, src) in self.shape.iter_mut().zip(shape) { - *dst = *src as i64; - } - Ok(()) - } - - fn current_len(&self) -> usize { - self.shape.iter().map(|dim| *dim as usize).product() - } - - fn current_bytes(&self) -> usize { - self.current_len() * std::mem::size_of::() - } - - fn copy_from_host_async(&mut self, resources: &Resources, src: &[T]) -> Result<()> { - let expected_len = self.current_len(); - if src.len() != expected_len { - return Err(Error::io(format!( - "device tensor copy expects {expected_len} elements, got {}", - src.len() - ))); - } - check_cuda( - unsafe { - cuvs_sys::cudaMemcpyAsync( - self.tensor.dl_tensor.data, - src.as_ptr() as *const _, - self.current_bytes(), - cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, - 
resources - .get_cuda_stream() - .map_err(|e| Error::io(e.to_string()))?, - ) - }, - "copy host tensor to device", - ) - } - - fn copy_to_host_async(&self, resources: &Resources, dst: &mut [T]) -> Result<()> { - let expected_len = self.current_len(); - if dst.len() != expected_len { - return Err(Error::io(format!( - "device tensor copy expects destination length {expected_len}, got {}", - dst.len() - ))); - } - check_cuda( - unsafe { - cuvs_sys::cudaMemcpyAsync( - dst.as_mut_ptr() as *mut _, - self.tensor.dl_tensor.data, - self.current_bytes(), - cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, - resources - .get_cuda_stream() - .map_err(|e| Error::io(e.to_string()))?, - ) - }, - "copy device tensor to host", - ) - } -} - -impl Drop for DeviceTensor { - fn drop(&mut self) { - if !self.tensor.dl_tensor.data.is_null() { - let _ = unsafe { - cuvs_sys::cuvsRMMFree( - self.resources, - self.tensor.dl_tensor.data, - self.capacity_bytes, - ) - }; - } - } -} - -struct PinnedHostBuffer { - ptr: *mut T, - len: usize, - _marker: PhantomData, -} - -impl PinnedHostBuffer { - fn try_new(len: usize) -> Result { - let bytes = len - .checked_mul(std::mem::size_of::()) - .ok_or_else(|| Error::io("pinned host allocation size overflow"))?; - let mut raw = ptr::null_mut(); - check_cuda( - unsafe { cudaMallocHost(&mut raw, bytes) }, - "allocate pinned host buffer", - )?; - Ok(Self { - ptr: raw.cast::(), - len, - _marker: PhantomData, - }) - } - - fn as_slice(&self) -> &[T] { - unsafe { std::slice::from_raw_parts(self.ptr, self.len) } - } - - fn as_mut_slice(&mut self) -> &mut [T] { - unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) } - } - - fn prefix(&self, len: usize) -> Result<&[T]> { - if len > self.len { - return Err(Error::io(format!( - "pinned host buffer length {} is smaller than requested prefix {}", - self.len, len - ))); - } - Ok(&self.as_slice()[..len]) - } - - fn prefix_mut(&mut self, len: usize) -> Result<&mut [T]> { - if len > self.len { - return 
Err(Error::io(format!( - "pinned host buffer length {} is smaller than requested prefix {}", - self.len, len - ))); - } - Ok(&mut self.as_mut_slice()[..len]) - } - - fn copy_from_slice(&mut self, src: &[T]) -> Result<()> { - if src.len() > self.len { - return Err(Error::io(format!( - "pinned host buffer length {} is smaller than source length {}", - self.len, - src.len() - ))); - } - self.prefix_mut(src.len())?.copy_from_slice(src); - Ok(()) - } -} - -impl Drop for PinnedHostBuffer { - fn drop(&mut self) { - if !self.ptr.is_null() { - let _ = unsafe { cudaFreeHost(self.ptr.cast::()) }; - } - } -} - -struct CudaEvent { - raw: CudaEventHandle, -} - -impl CudaEvent { - fn try_new() -> Result { - let mut raw = ptr::null_mut(); - check_cuda(unsafe { cudaEventCreate(&mut raw) }, "create CUDA event")?; - Ok(Self { raw }) - } - - fn record(&self, stream: cuvs_sys::cudaStream_t) -> Result<()> { - check_cuda( - unsafe { cudaEventRecord(self.raw, stream) }, - "record CUDA event", - ) - } - - fn synchronize(&self) -> Result<()> { - check_cuda( - unsafe { cudaEventSynchronize(self.raw) }, - "synchronize CUDA event", - ) - } - -} - -impl Drop for CudaEvent { - fn drop(&mut self) { - if !self.raw.is_null() { - let _ = unsafe { cudaEventDestroy(self.raw) }; - } - } -} - -fn check_cuvs(status: cuvs_sys::cuvsError_t, context: &str) -> Result<()> { - if status == cuvs_sys::cuvsError_t::CUVS_SUCCESS { - return Ok(()); - } - - let message = unsafe { - let text = cuvs_sys::cuvsGetLastErrorText(); - if text.is_null() { - format!("{status:?}") - } else { - format!( - "{status:?}: {}", - CStr::from_ptr(text).to_string_lossy().into_owned() - ) - } - }; - Err(Error::io(format!("cuVS failed to {context}: {message}"))) -} - -fn check_cuda(status: cuvs_sys::cudaError_t, context: &str) -> Result<()> { - if status == cuvs_sys::cudaError::cudaSuccess { - Ok(()) - } else { - Err(Error::io(format!("CUDA failed to {context}: {status:?}"))) - } -} - -fn cuvs_distance_type(metric_type: DistanceType) -> 
Result { - match metric_type { - DistanceType::L2 => Ok(cuvs_sys::cuvsDistanceType::L2Expanded), - DistanceType::Cosine => Ok(cuvs_sys::cuvsDistanceType::CosineExpanded), - DistanceType::Dot => Ok(cuvs_sys::cuvsDistanceType::InnerProduct), - other => Err(Error::not_supported(format!( - "cuVS IVF_PQ does not support metric {other:?}" - ))), - } -} - -fn create_index_params( - metric_type: DistanceType, - num_partitions: usize, - num_sub_vectors: usize, - sample_rate: usize, - max_iters: usize, - num_bits: usize, -) -> Result { - let mut params = ptr::null_mut(); - check_cuvs( - unsafe { cuvs_sys::cuvsIvfPqIndexParamsCreate(&mut params) }, - "allocate IVF_PQ index params", - )?; - let metric = cuvs_distance_type(metric_type)?; - unsafe { - (*params).metric = metric; - (*params).metric_arg = 0.0; - (*params).add_data_on_build = false; - (*params).n_lists = num_partitions as u32; - (*params).kmeans_n_iters = max_iters as u32; - (*params).kmeans_trainset_fraction = 1.0; - (*params).pq_bits = num_bits as u32; - (*params).pq_dim = num_sub_vectors as u32; - (*params).codebook_kind = - cuvs_sys::cuvsIvfPqCodebookGen::CUVS_IVF_PQ_CODEBOOK_GEN_PER_SUBSPACE; - (*params).force_random_rotation = false; - (*params).conservative_memory_allocation = false; - (*params).max_train_points_per_pq_code = sample_rate as u32; - (*params).codes_layout = cuvs_sys::cuvsIvfPqListLayout::CUVS_IVF_PQ_LIST_LAYOUT_FLAT; - } - Ok(params) -} - -fn destroy_index_params(params: cuvs_sys::cuvsIvfPqIndexParams_t) { - if !params.is_null() { - let _ = unsafe { cuvs_sys::cuvsIvfPqIndexParamsDestroy(params) }; - } -} - -fn make_tensor_view() -> HostTensorView { - let shape = Vec::new(); - let tensor = cuvs_sys::DLManagedTensor { - dl_tensor: cuvs_sys::DLTensor { - data: ptr::null_mut(), - device: cuvs_sys::DLDevice { - device_type: cuvs_sys::DLDeviceType::kDLCPU, - device_id: 0, - }, - ndim: 0, - dtype: ::dl_dtype(), - shape: shape.as_ptr() as *mut i64, - strides: ptr::null_mut(), - byte_offset: 0, - }, - 
manager_ctx: ptr::null_mut(), - deleter: None, - }; - HostTensorView { shape, tensor } -} - -fn tensor_shape(tensor: &cuvs_sys::DLManagedTensor) -> Vec { - let dl_tensor = &tensor.dl_tensor; - (0..dl_tensor.ndim) - .map(|idx| unsafe { *dl_tensor.shape.add(idx as usize) as usize }) - .collect() -} - -fn tensor_num_bytes(tensor: &cuvs_sys::DLManagedTensor) -> usize { - let shape = tensor_shape(tensor); - let numel = shape.into_iter().product::(); - numel * ((tensor.dl_tensor.dtype.bits as usize) / 8) -} - -fn copy_tensor_to_host_f32_2d( - resources: &Resources, - tensor: &cuvs_sys::DLManagedTensor, -) -> Result> { - let shape = tensor_shape(tensor); - if shape.len() != 2 { - return Err(Error::io(format!( - "expected 2D tensor, got shape {shape:?}" - ))); - } - let mut array = Array2::::zeros((shape[0], shape[1])); - check_cuda( - unsafe { - cuvs_sys::cudaMemcpyAsync( - array.as_mut_ptr() as *mut _, - tensor.dl_tensor.data, - tensor_num_bytes(tensor), - cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, - resources - .get_cuda_stream() - .map_err(|e| Error::io(e.to_string()))?, - ) - }, - "copy tensor to host", - )?; - resources - .sync_stream() - .map_err(|e| Error::io(e.to_string()))?; - Ok(array) -} - -fn copy_tensor_to_host_f32_3d( - resources: &Resources, - tensor: &cuvs_sys::DLManagedTensor, -) -> Result<(Vec, [usize; 3])> { - let shape = tensor_shape(tensor); - if shape.len() != 3 { - return Err(Error::io(format!( - "expected 3D tensor, got shape {shape:?}" - ))); - } - let mut values = vec![0.0f32; shape.iter().product()]; - check_cuda( - unsafe { - cuvs_sys::cudaMemcpyAsync( - values.as_mut_ptr() as *mut _, - tensor.dl_tensor.data, - tensor_num_bytes(tensor), - cuvs_sys::cudaMemcpyKind_cudaMemcpyDefault, - resources - .get_cuda_stream() - .map_err(|e| Error::io(e.to_string()))?, - ) - }, - "copy tensor to host", - )?; - resources - .sync_stream() - .map_err(|e| Error::io(e.to_string()))?; - Ok((values, [shape[0], shape[1], shape[2]])) -} - -fn 
infer_dimension(dataset: &Dataset, column: &str) -> Result { - let field = dataset.schema().field(column).ok_or_else(|| { - Error::invalid_input(format!( - "column '{column}' does not exist in dataset schema" - )) - })?; - infer_vector_dim(&field.data_type()) -} - -fn matrix_from_vectors<'a>(vectors: &'a FixedSizeListArray) -> Result> { - let dim = vectors.value_length() as usize; - match vectors.value_type() { - DataType::Float32 => { - let values = vectors.values().as_primitive::(); - let values: &[f32] = values.values().as_ref(); - Ok(MatrixBuffer::Borrowed { - values, - rows: vectors.len(), - cols: dim, - }) - } - DataType::Float16 => { - let values = vectors.values().as_primitive::(); - let data = values - .values() - .iter() - .map(|value| value.to_f32()) - .collect::>(); - Ok(MatrixBuffer::Owned( - Array2::from_shape_vec((vectors.len(), dim), data).map_err(|error| { - Error::io(format!("failed to create float16 matrix copy: {error}")) - })?, - )) - } - DataType::Float64 => { - let values = vectors.values().as_primitive::(); - let data = values - .values() - .iter() - .map(|value| *value as f32) - .collect::>(); - Ok(MatrixBuffer::Owned( - Array2::from_shape_vec((vectors.len(), dim), data).map_err(|error| { - Error::io(format!("failed to create float64 matrix copy: {error}")) - })?, - )) - } - other => Err(Error::not_supported(format!( - "cuVS IVF_PQ currently supports float16/float32/float64 vectors, got {other}" - ))), - } -} - -fn ivf_centroids_from_host(array: Array2) -> Result { - let dim = array.ncols() as i32; - let values = Float32Array::from_iter_values(array.into_iter()); - Ok(FixedSizeListArray::try_new_from_values(values, dim)?) 
-} - -fn pq_codebook_from_host( - values: Vec, - shape: [usize; 3], - num_sub_vectors: usize, - dimension: usize, - num_bits: usize, -) -> Result { - let pq_book_size = 1usize << num_bits; - let subvector_dim = dimension / num_sub_vectors; - let expected = [num_sub_vectors, subvector_dim, pq_book_size]; - if shape != expected { - return Err(Error::io(format!( - "cuVS returned incompatible PQ codebook shape: expected {expected:?}, got {shape:?}" - ))); - } - - let mut flattened = Vec::with_capacity(values.len()); - for subspace in 0..num_sub_vectors { - for centroid in 0..pq_book_size { - for component in 0..subvector_dim { - let source_idx = ((subspace * subvector_dim + component) * pq_book_size) + centroid; - flattened.push(values[source_idx]); - } - } - } - - Ok(FixedSizeListArray::try_new_from_values( - Float32Array::from(flattened), - subvector_dim as i32, - )?) -} - -fn build_metadata_batch( - ivf_centroids: &FixedSizeListArray, - pq_codebook: &FixedSizeListArray, -) -> Result { - let ivf_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, ivf_centroids.len() as i32])); - let pq_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, pq_codebook.len() as i32])); - let ivf_list = ListArray::new( - Arc::new(Field::new( - "_ivf_centroids_item", - ivf_centroids.data_type().clone(), - false, - )), - ivf_offsets, - Arc::new(ivf_centroids.clone()), - None, - ); - let pq_list = ListArray::new( - Arc::new(Field::new( - "_pq_codebook_item", - pq_codebook.data_type().clone(), - false, - )), - pq_offsets, - Arc::new(pq_codebook.clone()), - None, - ); - let schema = Arc::new(ArrowSchema::new(vec![ - Field::new("_ivf_centroids", ivf_list.data_type().clone(), false), - Field::new("_pq_codebook", pq_list.data_type().clone(), false), - ])); - Ok(RecordBatch::try_new( - schema, - vec![Arc::new(ivf_list), Arc::new(pq_list)], - )?) 
-} - -fn metadata_writer_options() -> Result { - Ok(FileWriterOptions { - format_version: Some( - PARTITION_ARTIFACT_FILE_VERSION - .parse::() - .map_err(|error| { - Error::invalid_input(format!( - "invalid partition artifact file version '{}': {}", - PARTITION_ARTIFACT_FILE_VERSION, error - )) - })?, - ), - ..Default::default() - }) -} - -async fn write_partition_artifact_metadata( - artifact_uri: &str, - trained: &TrainedIvfPqIndex, -) -> Result<()> { - let (object_store, root_dir) = lance::io::ObjectStore::from_uri(artifact_uri) - .await - .map_err(|error| Error::io(error.to_string()))?; - let path = root_dir.child(PARTITION_ARTIFACT_METADATA_FILE_NAME); - let batch = build_metadata_batch(&trained.ivf_centroids, &trained.pq_codebook)?; - let mut writer = FileWriter::try_new( - object_store.create(&path).await?, - lance_core::datatypes::Schema::try_from(batch.schema().as_ref())?, - metadata_writer_options()?, - )?; - writer.add_schema_metadata( - "lance:index_build:artifact_version".to_string(), - "1".to_string(), - ); - writer.add_schema_metadata( - "lance:index_build:distance_type".to_string(), - trained.metric_type.to_string(), - ); - writer.add_schema_metadata( - "lance:index_build:num_partitions".to_string(), - trained.num_partitions.to_string(), - ); - writer.add_schema_metadata( - "lance:index_build:num_sub_vectors".to_string(), - trained.num_sub_vectors.to_string(), - ); - writer.add_schema_metadata( - "lance:index_build:num_bits".to_string(), - trained.num_bits.to_string(), - ); - writer.add_schema_metadata( - "lance:index_build:dimension".to_string(), - trained.dimension.to_string(), - ); - writer.write_batch(&batch).await?; - writer.finish().await?; - Ok(()) -} - -fn build_partition_batch( - row_ids: Arc, - partitions: &[u32], - pq_codes: &[u8], - code_width: usize, -) -> Result { - if pq_codes.len() != partitions.len() * code_width { - return Err(Error::io(format!( - "partition artifact batch expects {} PQ codes for {} rows and code width {}, got {}", 
- partitions.len() * code_width, - partitions.len(), - code_width, - pq_codes.len() - ))); - } - let schema = Arc::new(ArrowSchema::new(vec![ - Field::new(ROW_ID, DataType::UInt64, false), - Field::new(PART_ID_COLUMN, DataType::UInt32, false), - Field::new( - PQ_CODE_COLUMN, - DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::UInt8, true)), - code_width as i32, - ), - true, - ), - ])); - let pq_codes = FixedSizeListArray::try_new_from_values( - UInt8Array::from_iter_values(pq_codes.iter().copied()), - code_width as i32, - )?; - Ok(RecordBatch::try_new( - schema, - vec![ - row_ids, - Arc::new(UInt32Array::from_iter_values(partitions.iter().copied())), - Arc::new(pq_codes), - ], - )?) -} - -fn transform_batch_loss(batch: &RecordBatch) -> f64 { - batch - .metadata() - .get(LOSS_METADATA_KEY) - .and_then(|value| value.parse::().ok()) - .unwrap_or(0.0) -} - -struct TransformSlot { - input_host: PinnedHostBuffer, - input_device: DeviceTensor, - labels_host: PinnedHostBuffer, - labels_device: DeviceTensor, - codes_host: PinnedHostBuffer, - codes_device: DeviceTensor, - h2d_start: CudaEvent, - h2d_done: CudaEvent, - transform_done: CudaEvent, - output_ready: CudaEvent, - row_ids: Option>, - rows: usize, -} - -impl TransformSlot { - fn try_new( - resources: &Resources, - max_rows: usize, - dimension: usize, - code_width: usize, - ) -> Result { - Ok(Self { - input_host: PinnedHostBuffer::try_new(max_rows * dimension)?, - input_device: DeviceTensor::try_new(resources, &[max_rows, dimension])?, - labels_host: PinnedHostBuffer::try_new(max_rows)?, - labels_device: DeviceTensor::try_new(resources, &[max_rows])?, - codes_host: PinnedHostBuffer::try_new(max_rows * code_width)?, - codes_device: DeviceTensor::try_new(resources, &[max_rows, code_width])?, - h2d_start: CudaEvent::try_new()?, - h2d_done: CudaEvent::try_new()?, - transform_done: CudaEvent::try_new()?, - output_ready: CudaEvent::try_new()?, - row_ids: None, - rows: 0, - }) - } - - fn 
has_pending_output(&self) -> bool { - self.row_ids.is_some() - } - - fn launch( - &mut self, - trained: &TrainedIvfPqIndex, - stream: cuvs_sys::cudaStream_t, - row_ids: Arc, - matrix: &[f32], - rows: usize, - dimension: usize, - ) -> Result<()> { - let code_width = trained.pq_code_width(); - self.input_host.copy_from_slice(matrix)?; - self.input_device.set_shape(&[rows, dimension])?; - self.labels_device.set_shape(&[rows])?; - self.codes_device.set_shape(&[rows, code_width])?; - self.rows = rows; - self.row_ids = Some(row_ids); - - self.h2d_start.record(stream)?; - self.input_device.copy_from_host_async( - &trained.resources, - self.input_host.prefix(rows * dimension)?, - )?; - self.h2d_done.record(stream)?; - check_cuvs( - unsafe { - cuvs_sys::cuvsIvfPqTransform( - trained.resources.0, - trained.index.raw, - self.input_device.as_mut_ptr(), - self.labels_device.as_mut_ptr(), - self.codes_device.as_mut_ptr(), - ) - }, - "transform vectors with IVF_PQ", - )?; - self.transform_done.record(stream)?; - self.labels_device - .copy_to_host_async(&trained.resources, self.labels_host.prefix_mut(rows)?)?; - self.codes_device.copy_to_host_async( - &trained.resources, - self.codes_host.prefix_mut(rows * code_width)?, - )?; - self.output_ready.record(stream)?; - Ok(()) - } - - fn drain_to_batch(&mut self, code_width: usize) -> Result> { - if !self.has_pending_output() { - return Ok(None); - } - - self.output_ready.synchronize()?; - let row_ids = self - .row_ids - .take() - .ok_or_else(|| Error::io("transform slot is missing row ids"))?; - let batch = build_partition_batch( - row_ids, - self.labels_host.prefix(self.rows)?, - self.codes_host.prefix(self.rows * code_width)?, - code_width, - )?; - self.rows = 0; - Ok(Some(batch)) - } -} - -async fn for_each_transformed_batch( - dataset: &Dataset, - column: &str, - trained: &TrainedIvfPqIndex, - batch_size: usize, - filter_nan: bool, - mut on_batch: F, -) -> Result<()> -where - F: FnMut(RecordBatch) -> Fut, - Fut: 
std::future::Future>, -{ - let code_width = trained.pq_code_width(); - let mut scanner = dataset.scan(); - scanner.project(&[column])?; - if dataset - .schema() - .field(column) - .is_some_and(|field| field.nullable && filter_nan) - { - scanner.filter(&format!("{column} is not null"))?; - } - scanner.with_row_id(); - scanner.batch_size(batch_size); - let mut stream = scanner.try_into_stream().await?; - let cuda_stream = trained - .resources - .get_cuda_stream() - .map_err(|error| Error::io(error.to_string()))?; - let mut slots = (0..PIPELINE_SLOTS) - .map(|_| { - TransformSlot::try_new( - &trained.resources, - batch_size, - trained.dimension, - code_width, - ) - }) - .collect::>>()?; - let mut next_slot = 0usize; - - loop { - let Some(batch) = stream.try_next().await? else { - break; - }; - let slot = &mut slots[next_slot]; - if let Some(transformed) = slot.drain_to_batch(code_width)? { - on_batch(transformed).await?; - } - - let vectors = vector_column_to_fsl(&batch, column)?; - let row_ids = batch - .column_by_name(ROW_ID) - .ok_or_else(|| Error::invalid_input(format!("transform batch is missing {ROW_ID}")))?; - let finite_mask = is_finite(&vectors); - let valid_rows = finite_mask.true_count(); - if valid_rows == 0 { - continue; - } - if valid_rows != vectors.len() { - warn!( - "{} vectors are ignored during partition assignment because they are null or non-finite", - vectors.len() - valid_rows - ); - } - - let filtered_row_ids = if valid_rows == row_ids.len() { - row_ids.clone() - } else { - filter(row_ids.as_ref(), &finite_mask)? - }; - let filtered_vectors = if valid_rows == vectors.len() { - vectors - } else { - let vector_column = batch.column_by_name(column).ok_or_else(|| { - Error::invalid_input(format!( - "transform batch is missing vector column '{column}'" - )) - })?; - let field = batch - .schema() - .field_with_name(column) - .map_err(|_| { - Error::invalid_input(format!( - "transform batch schema is missing field '{column}'" - )) - })? 
- .clone(); - let filtered_vectors = filter(vector_column.as_ref(), &finite_mask)?; - vector_column_to_fsl( - &RecordBatch::try_new( - Arc::new(ArrowSchema::new(vec![field])), - vec![filtered_vectors], - )?, - column, - )? - }; - - let matrix = matrix_from_vectors(&filtered_vectors)?; - let matrix_view = matrix.view()?; - let input_slice = matrix_view - .as_slice_memory_order() - .ok_or_else(|| Error::io("transform matrix is not contiguous"))?; - - slot.launch( - trained, - cuda_stream, - filtered_row_ids, - input_slice, - matrix.rows(), - matrix_view.ncols(), - )?; - next_slot = (next_slot + 1) % PIPELINE_SLOTS; - } - - for slot in &mut slots { - if let Some(transformed) = slot.drain_to_batch(code_width)? { - on_batch(transformed).await?; - } - } - Ok(()) -} - -pub async fn train_ivf_pq( - dataset: &Dataset, - column: &str, - num_partitions: usize, - metric_type: DistanceType, - num_sub_vectors: usize, - sample_rate: usize, - max_iters: usize, - num_bits: usize, - filter_nan: bool, -) -> Result { - if num_bits != 8 { - return Err(Error::not_supported( - "cuVS IVF_PQ currently supports only num_bits=8", - )); - } - - let dimension = infer_dimension(dataset, column)?; - if dimension % num_sub_vectors != 0 { - return Err(Error::invalid_input(format!( - "cuVS IVF_PQ requires vector dimension {} to be divisible by num_sub_vectors {}", - dimension, num_sub_vectors - ))); - } - - let num_rows = dataset.count_rows(None).await?; - if num_rows == 0 { - return Err(Error::invalid_input( - "cuVS training requires at least one training vector", - )); - } - let train_rows = num_rows - .min((num_partitions * sample_rate).max(256 * 256)) - .max(1); - let train_vectors = if filter_nan { - let batch = dataset.scan().project(&[column])?.try_into_batch().await?; - let vectors = vector_column_to_fsl(&batch, column)?; - let mask = is_finite(&vectors); - let filtered = filter(&vectors, &mask)?.as_fixed_size_list().clone(); - filtered.slice(0, train_rows.min(filtered.len())) - } else { - 
let projection = dataset.schema().project(&[column])?; - let batch = dataset.sample(train_rows, &projection, None).await?; - vector_column_to_fsl(&batch, column)? - }; - if train_vectors.is_empty() { - return Err(Error::invalid_input( - "cuVS training requires at least one non-null training vector", - )); - } - - let matrix = matrix_from_vectors(&train_vectors)?; - let resources = Resources::new().map_err(|error| Error::io(error.to_string()))?; - let index = CuvsIvfPqIndex::try_new()?; - let params = create_index_params( - metric_type, - num_partitions, - num_sub_vectors, - sample_rate, - max_iters, - num_bits, - )?; - let matrix_view = matrix.view()?; - let mut dataset_tensor = HostTensorView::try_new::( - &[matrix_view.nrows(), matrix_view.ncols()], - matrix_view.as_ptr() as *mut std::ffi::c_void, - ); - - let build_result = check_cuvs( - unsafe { - cuvs_sys::cuvsIvfPqBuild(resources.0, params, dataset_tensor.as_mut_ptr(), index.raw) - }, - "build IVF_PQ index", - ); - destroy_index_params(params); - build_result?; - - let mut centers = make_tensor_view(); - check_cuvs( - unsafe { cuvs_sys::cuvsIvfPqIndexGetCenters(index.raw, centers.as_mut_ptr()) }, - "get IVF centroids", - )?; - let ivf_centroids = - ivf_centroids_from_host(copy_tensor_to_host_f32_2d(&resources, ¢ers.tensor)?)?; - - let mut pq_centers = make_tensor_view(); - check_cuvs( - unsafe { cuvs_sys::cuvsIvfPqIndexGetPqCenters(index.raw, pq_centers.as_mut_ptr()) }, - "get PQ codebook", - )?; - let (pq_codebook_values, pq_codebook_shape) = - copy_tensor_to_host_f32_3d(&resources, &pq_centers.tensor)?; - let pq_codebook = pq_codebook_from_host( - pq_codebook_values, - pq_codebook_shape, - num_sub_vectors, - dimension, - num_bits, - )?; - - Ok(TrainedIvfPqIndex { - resources, - index, - num_partitions, - dimension, - num_sub_vectors, - num_bits, - metric_type, - ivf_centroids, - pq_codebook, - }) -} - -pub async fn assign_ivf_pq_to_artifact( - dataset: &Dataset, - column: &str, - trained: 
&TrainedIvfPqIndex, - artifact_uri: &str, - batch_size: usize, - filter_nan: bool, -) -> Result> { - let code_width = trained.pq_code_width(); - let builder = Arc::new(tokio::sync::Mutex::new( - PartitionArtifactBuilder::try_new(artifact_uri, trained.num_partitions, code_width, None) - .await?, - )); - for_each_transformed_batch(dataset, column, trained, batch_size, filter_nan, |batch| { - let builder = builder.clone(); - async move { - builder.lock().await.append_batch(&batch).await?; - Ok(()) - } - }) - .await?; - let mut builder = Arc::try_unwrap(builder) - .map_err(|_| Error::io("partition artifact builder still has outstanding references"))? - .into_inner(); - - write_partition_artifact_metadata(artifact_uri, trained).await?; - let mut files = builder.finish(PARTITION_ARTIFACT_METADATA_FILE_NAME, None).await?; - if files.len() > 1 { - files.insert(1, PARTITION_ARTIFACT_METADATA_FILE_NAME.to_string()); - } - Ok(files) -} From 578f789526cb6ff14ebc431ec50930053a90964d Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 8 Apr 2026 19:52:27 +0800 Subject: [PATCH 16/21] refactor: remove in-tree cuvs integration --- AGENTS.md | 6 - python/DEVELOPMENT.md | 16 - python/python/lance/cuvs.py | 637 ------------------ python/python/lance/dataset.py | 189 ++---- python/python/lance/indices/builder.py | 33 +- python/python/lance/lance/__init__.pyi | 14 - python/python/tests/test_vector_index.py | 368 +--------- python/src/dataset.rs | 4 - python/src/file.rs | 77 --- python/src/lib.rs | 4 +- rust/lance-index/src/vector/ivf/builder.rs | 6 - rust/lance/src/index/vector.rs | 2 - rust/lance/src/index/vector/builder.rs | 27 - .../lance/src/index/vector/encoded_dataset.rs | 370 ---------- rust/lance/src/index/vector/ivf.rs | 35 - 15 files changed, 106 insertions(+), 1682 deletions(-) delete mode 100644 python/python/lance/cuvs.py delete mode 100644 rust/lance/src/index/vector/encoded_dataset.rs diff --git a/AGENTS.md b/AGENTS.md index ec2b3e21773..8543d23521a 100644 --- a/AGENTS.md 
+++ b/AGENTS.md @@ -53,12 +53,6 @@ cd test_data && docker compose up -d AWS_DEFAULT_REGION=us-east-1 pytest --run-integration python/tests/test_s3_ddb.py ``` -### Benchmarking Discipline - -- Benchmark machines must use release builds only. For Python bindings, always run `maturin develop --release` before collecting any timing data. -- Never use `maturin develop` without `--release` on a benchmark host. If a dev-profile rebuild is needed for functional debugging, use a different machine or clearly discard all performance results collected afterwards. -- Before trusting a benchmark result, verify the mounted benchmark volume and the active build profile. - ## Coding Standards ### General diff --git a/python/DEVELOPMENT.md b/python/DEVELOPMENT.md index 21dba0bdddd..12c56549608 100644 --- a/python/DEVELOPMENT.md +++ b/python/DEVELOPMENT.md @@ -8,22 +8,6 @@ uv sync --extra tests --extra dev Add extras such as `benchmarks`, `torch`, or `geo` only when you need them. After the environment is initialized, either activate it or use `uv run ...` for commands. -`accelerator="cuvs"` does not have a normal project extra today. cuVS Python -packages are published per CUDA major version and are typically installed from -NVIDIA's package index, for example: - -```shell -uv pip install --extra-index-url https://pypi.nvidia.com cuvs-cu12 -``` - -or: - -```shell -uv pip install --extra-index-url https://pypi.nvidia.com cuvs-cu13 -``` - -Pick the package that matches the CUDA version in your environment. - `uv sync` is not just downloading Python packages here. It also builds the local `pylance` Rust extension as part of the editable environment, so the first run, cache misses, or Rust dependency changes can make it noticeably slow. This is expected; let the build finish instead of interrupting it and switching to a different environment setup. 
## Building the project diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py deleted file mode 100644 index c32dc12b55c..00000000000 --- a/python/python/lance/cuvs.py +++ /dev/null @@ -1,637 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright The Lance Authors - -from __future__ import annotations - -import json -import re -import tempfile -from importlib import import_module -from typing import TYPE_CHECKING, Iterator, Tuple - -import pyarrow as pa -import pyarrow.compute as pc - -from .file import LanceFileSession -from .lance import PartitionArtifactBuilder -from .dependencies import numpy as np -from .log import LOGGER -from .util import _normalize_metric_type - -if TYPE_CHECKING: - from pathlib import Path - -PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY = ( - "lance:index_build:precomputed_encoded_partition_sizes" -) -PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY = ( - "lance:index_build:precomputed_encoded_partition_fragment_ids" -) -PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY = ( - "lance:index_build:precomputed_encoded_total_loss" -) - -PARTITION_ARTIFACT_MANIFEST_VERSION = 1 -PARTITION_ARTIFACT_MANIFEST_FILE_NAME = "manifest.json" -PARTITION_ARTIFACT_METADATA_FILE_NAME = "metadata.lance" -PARTITION_ARTIFACT_PARTITIONS_DIR = "partitions" -DEFAULT_PARTITION_ARTIFACT_BUCKETS = 256 -PARTITION_ARTIFACT_ROW_ID_COLUMN = "_rowid" - -def build_vector_index_on_cuvs( - dataset, - column: str, - metric_type: str, - accelerator: str, - num_partitions: int, - num_sub_vectors: int, - dst_dataset_uri: str | Path | None = None, - *, - sample_rate: int = 256, - max_iters: int = 50, - num_bits: int = 8, - batch_size: int = 1024 * 128, - filter_nan: bool = True, -): - if dst_dataset_uri is None: - dst_dataset_uri = tempfile.mkdtemp() - - trained_index, ivf_centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( - dataset, - column, - num_partitions, - metric_type, - accelerator, - num_sub_vectors=num_sub_vectors, - 
sample_rate=sample_rate, - max_iters=max_iters, - num_bits=num_bits, - filter_nan=filter_nan, - ) - artifact_root, artifact_files = one_pass_assign_ivf_pq_on_cuvs( - dataset, - column, - metric_type, - accelerator, - ivf_centroids, - pq_codebook, - trained_index=trained_index, - dst_dataset_uri=dst_dataset_uri, - batch_size=batch_size, - filter_nan=filter_nan, - ) - return artifact_root, artifact_files, ivf_centroids, pq_codebook - - -def is_cuvs_accelerator(accelerator: object) -> bool: - return accelerator == "cuvs" - - -def _require_cuvs(): - try: - return import_module("cuvs.neighbors.ivf_pq") - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "accelerator='cuvs' requires cuVS Python bindings to be installed. " - "Install a CUDA-matched package such as 'cuvs-cu12' or 'cuvs-cu13' " - "from https://pypi.nvidia.com." - ) from exc - - -def _optional_cupy(): - try: - return import_module("cupy") - except ModuleNotFoundError: - return None - - -def _make_progress(total: int): - try: - from tqdm.auto import tqdm - - return tqdm(total=total) - except ModuleNotFoundError: - - class _NoOpProgress: - def set_description(self, _description: str): - return None - - def update(self, _count: int): - return None - - def close(self): - return None - - return _NoOpProgress() - - -def _metric_to_cuvs(metric_type: str) -> str: - metric_type = _normalize_metric_type(metric_type).lower() - if metric_type in {"l2", "euclidean"}: - return "sqeuclidean" - if metric_type == "dot": - return "inner_product" - if metric_type == "cosine": - return "cosine" - raise ValueError(f"Metric '{metric_type}' is not supported by cuVS IVF_PQ") - - -def _coerce_float_matrix(matrix: np.ndarray, *, column: str) -> np.ndarray: - if matrix.ndim != 2: - raise ValueError( - f"Expected a 2D training matrix for column '{column}', got {matrix.shape}" - ) - if matrix.dtype == np.float64: - matrix = matrix.astype(np.float32) - elif matrix.dtype not in (np.float16, np.float32): - matrix = 
matrix.astype(np.float32) - return matrix - - -def _column_to_numpy(table: pa.Table | pa.RecordBatch, column: str) -> np.ndarray: - array = table.column(column) - if isinstance(array, pa.ChunkedArray): - array = array.combine_chunks() - if len(array) == 0: - raise ValueError("cuVS training requires at least one training vector") - - if pa.types.is_fixed_size_list(array.type): - values = array.values.to_numpy(zero_copy_only=False) - matrix = values.reshape(len(array), array.type.list_size) - return _coerce_float_matrix(matrix, column=column) - - values = array.to_pylist() - return _coerce_float_matrix(np.asarray(values), column=column) - - -def _annotate_precomputed_encoded_dataset( - dataset, - partition_sizes: list[int], - *, - total_loss: float | None = None, -) -> None: - partition_fragments = [[] for _ in range(len(partition_sizes))] - for fragment in dataset.get_fragments(): - fragment_partitions = set() - scanner = fragment.scanner(columns=["__ivf_part_id"]) - for batch in scanner.to_batches(): - fragment_partitions.update( - int(partition_id) - for partition_id in np.unique( - batch.column("__ivf_part_id").to_numpy(zero_copy_only=False) - ) - ) - for partition_id in fragment_partitions: - partition_fragments[partition_id].append(int(fragment.metadata.id)) - - metadata = { - PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY: json.dumps( - [int(size) for size in partition_sizes] - ), - PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY: json.dumps( - partition_fragments - ), - } - if total_loss is not None: - metadata[PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY] = json.dumps( - float(total_loss) - ) - dataset.update_metadata(metadata) - - -def _as_numpy(array_like) -> np.ndarray: - if isinstance(array_like, np.ndarray): - return array_like - - if hasattr(array_like, "copy_to_host"): - return np.asarray(array_like.copy_to_host()) - - try: - array = np.asarray(array_like) - if isinstance(array, np.ndarray): - return array - except Exception: - pass - - if 
hasattr(array_like, "get"): - return np.asarray(array_like.get()) - - cupy = _optional_cupy() - if cupy is not None: - return cupy.asnumpy(array_like) - - raise TypeError("Unable to convert cuVS output to numpy") - - -def _normalize_artifact_root(path_or_uri: str | Path) -> str: - root = str(path_or_uri) - if re.search(r".:\\", root) is not None: - root = root.replace("\\", "/", 1) - return root - - -def _make_metadata_table( - ivf_centroids: np.ndarray, - pq_codebook: np.ndarray, -) -> pa.Table: - dimension = ivf_centroids.shape[1] - subvector_dim = pq_codebook.shape[2] - ivf_type = pa.list_(pa.list_(pa.float32(), dimension)) - pq_type = pa.list_(pa.list_(pa.float32(), subvector_dim)) - ivf_values = pa.array([ivf_centroids.tolist()], type=ivf_type) - pq_values = pa.array( - [pq_codebook.reshape(-1, subvector_dim).tolist()], - type=pq_type, - ) - return pa.Table.from_arrays( - [ivf_values, pq_values], - names=["_ivf_centroids", "_pq_codebook"], - ) - - -def _write_partition_artifact_metadata( - session: LanceFileSession, - *, - ivf_centroids: np.ndarray, - pq_codebook: np.ndarray, - metric_type: str, - num_bits: int, -) -> None: - metadata_table = _make_metadata_table(ivf_centroids, pq_codebook) - with session.open_writer( - PARTITION_ARTIFACT_METADATA_FILE_NAME, - schema=metadata_table.schema, - version="2.2", - ) as writer: - writer.add_schema_metadata("lance:index_build:artifact_version", "1") - writer.add_schema_metadata( - "lance:index_build:distance_type", _normalize_metric_type(metric_type) - ) - writer.add_schema_metadata( - "lance:index_build:num_partitions", str(ivf_centroids.shape[0]) - ) - writer.add_schema_metadata( - "lance:index_build:num_sub_vectors", str(pq_codebook.shape[0]) - ) - writer.add_schema_metadata("lance:index_build:num_bits", str(num_bits)) - writer.add_schema_metadata("lance:index_build:dimension", str(ivf_centroids.shape[1])) - writer.write_batch(metadata_table) - - -def _write_partition_artifact( - batches: Iterator[pa.RecordBatch], 
- *, - artifact_root: str | Path, - ivf_centroids: np.ndarray, - pq_codebook: np.ndarray, - metric_type: str, - num_bits: int, - num_partitions: int, - total_loss: float | None = None, -) -> tuple[str, list[str]]: - artifact_root = _normalize_artifact_root(artifact_root) - session = LanceFileSession(artifact_root) - builder = PartitionArtifactBuilder( - artifact_root, - num_partitions=num_partitions, - pq_code_width=pq_codebook.shape[0], - ) - for batch in batches: - builder.append_batch(batch) - - _write_partition_artifact_metadata( - session, - ivf_centroids=ivf_centroids, - pq_codebook=pq_codebook, - metric_type=metric_type, - num_bits=num_bits, - ) - artifact_files = builder.finish( - PARTITION_ARTIFACT_METADATA_FILE_NAME, - float(total_loss) if total_loss is not None else None, - ) - artifact_files.insert(1, PARTITION_ARTIFACT_METADATA_FILE_NAME) - return artifact_root, artifact_files - - -def _to_cuvs_transform_input(matrix: np.ndarray): - cupy = _optional_cupy() - if cupy is None: - raise ModuleNotFoundError( - "accelerator='cuvs' full index build requires the 'cupy' package " - "to pass transform batches in device memory" - ) - return cupy.asarray(matrix) - - -def _normalize_centroids(index, num_partitions: int, dimension: int) -> np.ndarray: - centroids = _as_numpy(index.centers) - if centroids.shape != (num_partitions, dimension): - raise ValueError( - "cuVS returned incompatible IVF centroids shape: " - f"expected {(num_partitions, dimension)}, got {centroids.shape}" - ) - return centroids - - -def _normalize_pq_codebook( - index, num_sub_vectors: int, num_bits: int, dimension: int -) -> np.ndarray: - pq_book_size = 1 << num_bits - subvector_dim = dimension // num_sub_vectors - pq_centers = _as_numpy(index.pq_centers) - - expected_shapes = { - (subvector_dim, num_sub_vectors, pq_book_size): (1, 2, 0), - (num_sub_vectors, subvector_dim, pq_book_size): (0, 2, 1), - (num_sub_vectors, pq_book_size, subvector_dim): None, - } - transpose = 
expected_shapes.get(pq_centers.shape) - if transpose is None and pq_centers.shape not in expected_shapes: - raise ValueError( - "cuVS returned incompatible PQ codebook shape: expected one of " - f"{list(expected_shapes.keys())}, got {pq_centers.shape}" - ) - if transpose is not None: - pq_centers = np.transpose(pq_centers, transpose) - return pq_centers - - -def _estimate_trainset_fraction( - num_rows: int, num_partitions: int, sample_rate: int -) -> float: - if num_rows <= 0: - raise ValueError("cuVS training requires a non-empty dataset") - desired_rows = max(num_partitions * sample_rate, 256 * 256) - return min(1.0, desired_rows / num_rows) - - -def _sample_training_table( - dataset, column: str, train_rows: int, filt: str | None -) -> pa.Table: - if filt is None: - return dataset.sample(train_rows, columns=[column], randomize_order=True) - - total_rows = dataset.count_rows() - sample_rows = min(total_rows, max(train_rows * 2, train_rows + 1024)) - trainset = dataset.sample(sample_rows, columns=[column], randomize_order=True) - trainset = trainset.filter(pc.is_valid(trainset.column(column))) - if len(trainset) >= train_rows or sample_rows == total_rows: - return trainset.slice(0, min(train_rows, len(trainset))) - - return dataset.to_table(columns=[column], filter=filt, limit=train_rows) - - -def _train_ivf_pq_index_on_cuvs( - dataset, - column: str, - num_partitions: int, - metric_type: str, - accelerator: str, - num_sub_vectors: int, - *, - sample_rate: int = 256, - max_iters: int = 50, - num_bits: int = 8, - filter_nan: bool = True, -): - if accelerator != "cuvs": - raise ValueError("cuVS acceleration only supports accelerator='cuvs'") - if num_bits != 8: - raise ValueError("cuVS IVF_PQ integration currently supports only num_bits=8") - - dimension = dataset.schema.field(column).type.list_size - if dimension % num_sub_vectors != 0: - raise ValueError( - "cuVS IVF_PQ integration requires vector dimension to be divisible by " - "num_sub_vectors" - ) - - if 
dataset.schema.field(column).nullable and filter_nan: - filt = f"{column} is not null" - else: - filt = None - - num_rows = dataset.count_rows(filter=filt) - if num_rows == 0: - raise ValueError("cuVS training requires at least one non-null training vector") - - train_rows = max(1, min(num_rows, max(num_partitions * sample_rate, 256 * 256))) - trainset = _sample_training_table(dataset, column, train_rows, filt) - matrix = _column_to_numpy(trainset, column) - - ivf_pq = _require_cuvs() - build_params = ivf_pq.IndexParams( - n_lists=num_partitions, - metric=_metric_to_cuvs(metric_type), - kmeans_n_iters=max_iters, - kmeans_trainset_fraction=_estimate_trainset_fraction( - matrix.shape[0], num_partitions, sample_rate - ), - pq_bits=num_bits, - pq_dim=num_sub_vectors, - codebook_kind="subspace", - force_random_rotation=False, - add_data_on_build=False, - ) - - index = ivf_pq.build(build_params, matrix) - centroids = _normalize_centroids(index, num_partitions, dimension) - pq_codebook = _normalize_pq_codebook(index, num_sub_vectors, num_bits, dimension) - return index, centroids, pq_codebook - - -def one_pass_assign_ivf_pq_on_cuvs( - dataset, - column: str, - metric_type: str, - accelerator: str, - ivf_centroids: np.ndarray, - pq_codebook: np.ndarray, - trained_index=None, - dst_dataset_uri: str | Path | None = None, - batch_size: int = 1024 * 128, - *, - filter_nan: bool = True, -): - if accelerator != "cuvs": - raise ValueError("cuVS acceleration only supports accelerator='cuvs'") - - num_rows = dataset.count_rows() - if dataset.schema.field(column).nullable and filter_nan: - filt = f"{column} is not null" - else: - filt = None - - num_sub_vectors = pq_codebook.shape[0] - ivf_pq = _require_cuvs() - - if trained_index is None: - raise ValueError( - "one_pass_assign_ivf_pq_on_cuvs requires a trained cuVS index for " - "single-node transform" - ) - transform_code_width = (trained_index.pq_dim * trained_index.pq_bits + 7) // 8 - if transform_code_width != num_sub_vectors: 
- raise ValueError( - "cuVS transform output is incompatible with Lance IVF_PQ for this " - "configuration: expected " - f"{num_sub_vectors} PQ code columns, but cuVS will produce " - f"{transform_code_width}. Use a configuration where " - "ceil(pq_dim * pq_bits / 8) == num_sub_vectors." - ) - - progress = _make_progress(num_rows) - progress.set_description("Assigning partitions and computing pq codes") - num_partitions = ivf_centroids.shape[0] - partition_sizes = np.zeros(num_partitions, dtype=np.int64) - - output_schema = pa.schema( - [ - pa.field(PARTITION_ARTIFACT_ROW_ID_COLUMN, pa.uint64()), - pa.field("__ivf_part_id", pa.uint32()), - pa.field("__pq_code", pa.list_(pa.uint8(), list_size=num_sub_vectors)), - ] - ) - - def _partition_and_pq_codes_assignment() -> Iterator[pa.RecordBatch]: - for batch in dataset.to_batches( - columns=[column], - filter=filt, - with_row_id=True, - batch_size=batch_size, - ): - vectors = _column_to_numpy(batch, column) - row_ids = batch.column("_rowid").to_numpy() - valid_mask = np.isfinite(vectors).all(axis=1) - if not np.all(valid_mask): - LOGGER.warning( - "%s vectors are ignored during partition assignment", - len(valid_mask) - int(valid_mask.sum()), - ) - row_ids = row_ids[valid_mask] - vectors = vectors[valid_mask] - if len(row_ids) == 0: - continue - partitions, pq_codes = ivf_pq.transform( - trained_index, _to_cuvs_transform_input(vectors) - ) - partitions = _as_numpy(partitions).astype(np.uint32, copy=False) - partition_sizes[:] += np.bincount(partitions, minlength=num_partitions) - pq_codes = _as_numpy(pq_codes).astype(np.uint8, copy=False) - if pq_codes.shape != (len(row_ids), num_sub_vectors): - raise ValueError( - "cuVS transform returned incompatible PQ codes shape: " - f"expected {(len(row_ids), num_sub_vectors)}, got {pq_codes.shape}" - ) - - pq_values = pa.array(pq_codes.reshape(-1), type=pa.uint8()) - pq_code_array = pa.FixedSizeListArray.from_arrays( - pq_values, num_sub_vectors - ) - yield 
pa.RecordBatch.from_arrays( - [ - pa.array(row_ids, type=pa.uint64()), - pa.array(partitions, type=pa.uint32()), - pq_code_array, - ], - schema=output_schema, - ) - progress.update(len(row_ids)) - - if dst_dataset_uri is None: - dst_dataset_uri = tempfile.mkdtemp() - artifact_root, artifact_files = _write_partition_artifact( - _partition_and_pq_codes_assignment(), - artifact_root=dst_dataset_uri, - ivf_centroids=ivf_centroids, - pq_codebook=pq_codebook, - metric_type=metric_type, - num_bits=8, - num_partitions=num_partitions, - ) - - progress.close() - LOGGER.info("Saved precomputed partition artifact to %s", artifact_root) - return str(artifact_root), artifact_files - - -def train_ivf_pq_on_cuvs( - dataset, - column: str, - num_partitions: int, - metric_type: str, - accelerator: str, - num_sub_vectors: int, - *, - sample_rate: int = 256, - max_iters: int = 50, - num_bits: int = 8, - filter_nan: bool = True, -) -> Tuple[np.ndarray, np.ndarray]: - _, centroids, pq_codebook = _train_ivf_pq_index_on_cuvs( - dataset, - column, - num_partitions, - metric_type, - accelerator, - num_sub_vectors, - sample_rate=sample_rate, - max_iters=max_iters, - num_bits=num_bits, - filter_nan=filter_nan, - ) - return centroids, pq_codebook - - -def one_pass_train_ivf_pq_on_cuvs( - dataset, - column: str, - num_partitions: int, - metric_type: str, - accelerator: str, - num_sub_vectors: int, - *, - sample_rate: int = 256, - max_iters: int = 50, - num_bits: int = 8, - filter_nan: bool = True, -): - return train_ivf_pq_on_cuvs( - dataset, - column, - num_partitions, - metric_type, - accelerator, - num_sub_vectors, - sample_rate=sample_rate, - max_iters=max_iters, - num_bits=num_bits, - filter_nan=filter_nan, - ) - - -def prepare_global_ivf_pq_on_cuvs( - dataset, - column: str, - num_partitions: int, - num_sub_vectors: int, - *, - distance_type: str = "l2", - accelerator: str = "cuvs", - sample_rate: int = 256, - max_iters: int = 50, - num_bits: int = 8, -): - centroids, pq_codebook = 
train_ivf_pq_on_cuvs( - dataset, - column, - num_partitions, - distance_type, - accelerator, - num_sub_vectors, - sample_rate=sample_rate, - max_iters=max_iters, - num_bits=num_bits, - ) - return {"ivf_centroids": centroids, "pq_codebook": pq_codebook} diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index ee023048a1e..32e9e548d68 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -39,7 +39,6 @@ from lance.log import LOGGER from .blob import BlobFile -from .cuvs import is_cuvs_accelerator from .dependencies import ( _check_for_numpy, _check_for_torch, @@ -2900,24 +2899,27 @@ def _create_index_impl( # Handle timing for various parts of accelerated builds timers = {} - use_cuvs = is_cuvs_accelerator(accelerator) + if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): + raise ValueError( + "accelerator='cuvs' is not built into Lance. " + "Use the external 'lance-cuvs' package to produce a " + "precomputed partition artifact and then call create_index " + "with precomputed_partition_artifact_uri." + ) if accelerator is not None and index_type != "IVF_PQ": - if use_cuvs: - raise ValueError( - f"accelerator='{accelerator}' only supports IVF_PQ index builds" - ) LOGGER.warning( "Index type %s does not support GPU acceleration; falling back to CPU", index_type, ) accelerator = None - use_cuvs = False # IMPORTANT: Distributed indexing is CPU-only. Enforce single-node when - # any Python-side accelerator path is selected. - accelerated_build_detected = accelerator is not None + # accelerator or torch-related paths are detected. 
+ torch_detected = False try: - if accelerator is None: + if accelerator is not None: + torch_detected = True + else: impl = kwargs.get("implementation") use_torch_flag = kwargs.get("use_torch") is True one_pass_flag = kwargs.get("one_pass_ivfpq") is True @@ -2930,16 +2932,16 @@ def _create_index_impl( or torch_centroids or torch_codebook ): - accelerated_build_detected = True + torch_detected = True except Exception: # Be conservative: if detection fails, do not modify behavior pass - if accelerated_build_detected: + if torch_detected: if require_commit: if fragment_ids is not None or index_uuid is not None: LOGGER.info( - "Accelerated build detected; " + "Torch detected; " "enforce single-node indexing (distributed is CPU-only)." ) fragment_ids = None @@ -2947,7 +2949,7 @@ def _create_index_impl( else: if index_uuid is not None: LOGGER.info( - "Accelerated build detected; " + "Torch detected; " "enforce single-node indexing (distributed is CPU-only)." ) index_uuid = None @@ -2958,83 +2960,52 @@ def _create_index_impl( num_partitions = _target_partition_size_to_num_partitions( num_rows, target_partition_size ) + from .vector import ( + one_pass_assign_ivf_pq_on_accelerator, + one_pass_train_ivf_pq_on_accelerator, + ) - if use_cuvs: - from .cuvs import build_vector_index_on_cuvs - - LOGGER.info("Doing cuVS vector backend build") - timers["ivf+pq_build:start"] = time.time() - artifact_root, _, ivf_centroids, pq_codebook = build_vector_index_on_cuvs( - self, - column[0], - metric, - accelerator, - num_partitions, - num_sub_vectors, - sample_rate=kwargs.get("sample_rate", 256), - max_iters=kwargs.get("max_iters", 50), - num_bits=kwargs.get("num_bits", 8), - batch_size=1024 * 128, - filter_nan=filter_nan, - ) - kwargs["precomputed_partition_artifact_uri"] = artifact_root - timers["ivf+pq_build:end"] = time.time() - ivfpq_build_time = ( - timers["ivf+pq_build:end"] - timers["ivf+pq_build:start"] - ) - LOGGER.info("cuVS ivf+pq build time: %ss", ivfpq_build_time) - else: 
- from .vector import ( - one_pass_assign_ivf_pq_on_accelerator, - one_pass_train_ivf_pq_on_accelerator, - ) - - LOGGER.info("Doing one-pass ivfpq accelerated computations") - timers["ivf+pq_train:start"] = time.time() - ( - ivf_centroids, - ivf_kmeans, - pq_codebook, - pq_kmeans_list, - ) = one_pass_train_ivf_pq_on_accelerator( - self, - column[0], - num_partitions, - metric, - accelerator, - num_sub_vectors=num_sub_vectors, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_train:end"] = time.time() - ivfpq_train_time = ( - timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] - ) - LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) - timers["ivf+pq_assign:start"] = time.time() - ( - shuffle_output_dir, - shuffle_buffers, - ) = one_pass_assign_ivf_pq_on_accelerator( - self, - column[0], - metric, - accelerator, - ivf_kmeans, - pq_kmeans_list, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_assign:end"] = time.time() - ivfpq_assign_time = ( - timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] - ) - LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) + LOGGER.info("Doing one-pass ivfpq accelerated computations") + timers["ivf+pq_train:start"] = time.time() + ( + ivf_centroids, + ivf_kmeans, + pq_codebook, + pq_kmeans_list, + ) = one_pass_train_ivf_pq_on_accelerator( + self, + column[0], + num_partitions, + metric, + accelerator, + num_sub_vectors=num_sub_vectors, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_train:end"] = time.time() + ivfpq_train_time = timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] + LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) + timers["ivf+pq_assign:start"] = time.time() + shuffle_output_dir, shuffle_buffers = one_pass_assign_ivf_pq_on_accelerator( + self, + column[0], + metric, + accelerator, + ivf_kmeans, + pq_kmeans_list, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_assign:end"] = time.time() + ivfpq_assign_time = 
( + timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] + ) + LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) - kwargs["precomputed_shuffle_buffers"] = shuffle_buffers - kwargs["precomputed_shuffle_buffers_path"] = os.path.join( - shuffle_output_dir, "data" - ) + kwargs["precomputed_shuffle_buffers"] = shuffle_buffers + kwargs["precomputed_shuffle_buffers_path"] = os.path.join( + shuffle_output_dir, "data" + ) if index_type.startswith("IVF"): if (ivf_centroids is not None) and (ivf_centroids_file is not None): raise ValueError( @@ -3190,13 +3161,6 @@ def _create_index_impl( "Temporary shuffle buffers stored at %s, you may want to delete it.", kwargs["precomputed_shuffle_buffers_path"], ) - if "precomputed_encoded_dataset_uri" in kwargs.keys() and os.path.exists( - kwargs["precomputed_encoded_dataset_uri"] - ): - LOGGER.info( - "Temporary precomputed encoded dataset stored at %s, you may want to delete it.", - kwargs["precomputed_encoded_dataset_uri"], - ) if "precomputed_partition_artifact_uri" in kwargs.keys() and os.path.exists( kwargs["precomputed_partition_artifact_uri"] ): @@ -3279,17 +3243,7 @@ def create_index( The number of sub-vectors for PQ (Product Quantization). accelerator : str or ``torch.Device``, optional If set, use an accelerator to speed up the training process. - Accepted accelerator: - - - "cuda" or ``torch.device(...)`` for the existing torch-based path - on NVIDIA GPUs - - "mps" for Apple Silicon GPU - - "cuvs" for the explicit cuVS-based IVF_PQ training path on NVIDIA - GPUs - - The cuVS path also requires the cuVS Python bindings to be installed - separately. - + Accepted accelerator: "cuda" (Nvidia GPU) and "mps" (Apple Silicon GPU). If not set, use the CPU. index_cache_size : int, optional The size of the index cache in number of entries. Default value is 256. @@ -3355,6 +3309,11 @@ def create_index( Only 4, 8 are supported. - index_file_version The version of the index file. Default is "V3". 
+ - precomputed_partition_artifact_uri + An advanced input produced by an external backend such as + `lance-cuvs`. When set, Lance skips its own partition assignment + and consumes the precomputed partition-local artifact during + finalization. Requires `ivf_centroids` and `pq_codebook`. Optional parameters for `IVF_RQ`: @@ -3398,13 +3357,8 @@ def create_index( Experimental Accelerator (GPU) support: - *accelerate*: use GPU to train IVF partitions. - `accelerator="cuda"` and `accelerator="mps"` use the existing torch - path. `accelerator="cuda"` runs on NVIDIA GPUs and `accelerator="mps"` - runs on Apple Silicon GPUs. `accelerator="cuvs"` uses cuVS for IVF_PQ - training only and requires an NVIDIA GPU. - - The torch path requires PyTorch. The cuVS path requires the cuVS - Python bindings to be installed separately. + Only supports CUDA (Nvidia) or MPS (Apple) currently. + Requires PyTorch being installed. .. code-block:: python @@ -3419,10 +3373,9 @@ def create_index( accelerator="cuda" ) - Note: accelerator support is currently limited to the ``IVF_PQ`` index type. - Providing ``accelerator="cuda"`` for other index types will fall back to CPU - index building. Providing ``accelerator="cuvs"`` for other index types will - raise an error. + Note: GPU acceleration is currently supported only for the ``IVF_PQ`` index + type. Providing an accelerator for other index types will fall back to CPU + index building. 
References ---------- diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py index 00591ead934..a13e92faf8f 100644 --- a/python/python/lance/indices/builder.py +++ b/python/python/lance/indices/builder.py @@ -8,8 +8,6 @@ import numpy as np import pyarrow as pa - -from lance.cuvs import is_cuvs_accelerator, prepare_global_ivf_pq_on_cuvs from lance.indices.ivf import IvfModel from lance.indices.pq import PqModel @@ -116,10 +114,11 @@ def train_ivf( self._verify_ivf_sample_rate(sample_rate, num_partitions, num_rows) distance_type = self._normalize_distance_type(distance_type) self._verify_ivf_params(num_partitions) - if is_cuvs_accelerator(accelerator): - raise NotImplementedError( - "IndicesBuilder.train_ivf does not support accelerator='cuvs'; " - "use prepare_global_ivf_pq instead" + if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): + raise ValueError( + "accelerator='cuvs' is not built into Lance. " + "Use the external 'lance-cuvs' package to build training outputs " + "and partition artifacts." ) if accelerator is None: @@ -256,23 +255,11 @@ def prepare_global_ivf_pq( `IndicesBuilder.train_pq` (indices.train_pq_model). No public method names elsewhere are changed. """ - if is_cuvs_accelerator(accelerator): - if fragment_ids is not None: - raise NotImplementedError( - "fragment_ids is not supported with accelerator='cuvs'" - ) - num_rows = self._count_rows() - num_partitions = self._determine_num_partitions(num_partitions, num_rows) - num_subvectors = self._normalize_pq_params(num_subvectors, self.dimension) - return prepare_global_ivf_pq_on_cuvs( - self.dataset, - self.column[0], - num_partitions, - num_subvectors, - distance_type=distance_type, - accelerator=accelerator, - sample_rate=sample_rate, - max_iters=max_iters, + if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): + raise ValueError( + "accelerator='cuvs' is not built into Lance. 
" + "Use the external 'lance-cuvs' package to build training outputs " + "and partition artifacts." ) # Global IVF training diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index d377f381246..f0be29f39ca 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -135,20 +135,6 @@ class LanceFileSession: def upload_file(self, local_path: str, remote_path: str) -> None: ... def download_file(self, remote_path: str, local_path: str) -> None: ... -class PartitionArtifactBuilder: - def __init__( - self, - uri_or_path: str, - num_partitions: int, - pq_code_width: int, - storage_options: Optional[Dict[str, str]] = None, - storage_options_provider: Optional[StorageOptionsProvider] = None, - ): ... - def append_batch(self, batch: pa.RecordBatch) -> None: ... - def finish( - self, metadata_file: str, total_loss: Optional[float] = None - ) -> List[str]: ... - class LanceFileReader: def __init__( self, diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 9aba519fa9c..9606c91a724 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright The Lance Authors import logging -import json import os import platform import random @@ -10,29 +9,21 @@ import string import tempfile import time -from importlib import import_module -from pathlib import Path from typing import Optional import lance -import lance.cuvs as lance_cuvs import numpy as np import pyarrow as pa import pyarrow.compute as pc import pytest from lance import LanceDataset, LanceFragment from lance.dataset import VectorIndexReader -from lance.file import LanceFileReader from lance.indices import IndexFileVersion, IndicesBuilder from lance.query import MatchQuery, PhraseQuery from lance.util import validate_vector_index # noqa: E402 from lance.vector import vec_to_table # noqa: E402 -def 
_disable_rust_cuvs_backend(monkeypatch): - del monkeypatch - - def create_table(nvec=1000, ndim=128, nans=0, nullify=False, dtype=np.float32): mat = np.random.randn(nvec, ndim) if nans > 0: @@ -543,357 +534,46 @@ def test_create_index_accelerator_fallback(tmp_path, caplog): ) -def test_create_index_cuvs_dispatch(tmp_path, monkeypatch): - tbl = create_table(nvec=512, ndim=128) - dataset = lance.write_dataset(tbl, tmp_path) - calls = {} - - def fake_build( - dataset_arg, - column, - metric_type, - accelerator, - num_partitions, - num_sub_vectors, - **kwargs, - ): - calls["dataset"] = dataset_arg - calls["column"] = column - calls["num_partitions"] = num_partitions - calls["metric_type"] = metric_type - calls["accelerator"] = accelerator - calls["num_sub_vectors"] = num_sub_vectors - calls["kwargs"] = kwargs - return str(tmp_path / "cuvs_artifact"), [ - "manifest.json", - "metadata.lance", - "partitions/bucket-00000.lance", - ], np.random.randn(num_partitions, 128).astype(np.float32), np.random.randn( - num_sub_vectors, 256, 128 // num_sub_vectors - ).astype(np.float32) - - monkeypatch.setattr(lance_cuvs, "build_vector_index_on_cuvs", fake_build) - - dataset = dataset.create_index( - "vector", - index_type="IVF_PQ", - num_partitions=4, - num_sub_vectors=16, - accelerator="cuvs", - ) - - assert calls["column"] == "vector" - assert calls["num_partitions"] == 4 - assert calls["metric_type"] == "L2" - assert calls["accelerator"] == "cuvs" - assert calls["num_sub_vectors"] == 16 - assert calls["kwargs"]["sample_rate"] == 256 - assert calls["kwargs"]["max_iters"] == 50 - assert calls["kwargs"]["num_bits"] == 8 - assert calls["kwargs"]["batch_size"] == 1024 * 128 - assert calls["kwargs"]["filter_nan"] is True - assert dataset.stats.index_stats("vector_idx")["index_type"] == "IVF_PQ" - - -def test_create_index_cuvs_rejects_non_ivf_pq(tmp_path): +def test_create_index_rejects_cuvs_accelerator(tmp_path): tbl = create_table() dataset = lance.write_dataset(tbl, tmp_path) - with 
pytest.raises(ValueError, match="only supports IVF_PQ"): + with pytest.raises(ValueError, match="not built into Lance"): dataset.create_index( "vector", - index_type="IVF_FLAT", + index_type="IVF_PQ", num_partitions=4, + num_sub_vectors=16, accelerator="cuvs", ) -def test_prepare_global_ivf_pq_cuvs_dispatch(tmp_path, monkeypatch): - ds = _make_sample_dataset_base(tmp_path, "cuvs_prepare_ds", 512, 128) +def test_prepare_global_ivf_pq_rejects_cuvs_accelerator(tmp_path): + ds = _make_sample_dataset_base(tmp_path, "prepare_ivf_pq_cuvs_ds", 512, 128) builder = IndicesBuilder(ds, "vector") - builder_module = import_module("lance.indices.builder") - calls = {} - - def fake_prepare( - dataset_arg, - column, - num_partitions, - num_sub_vectors, - *, - distance_type, - accelerator, - sample_rate, - max_iters, - ): - calls["dataset"] = dataset_arg - calls["column"] = column - calls["num_partitions"] = num_partitions - calls["num_sub_vectors"] = num_sub_vectors - calls["distance_type"] = distance_type - calls["accelerator"] = accelerator - calls["sample_rate"] = sample_rate - calls["max_iters"] = max_iters - return { - "ivf_centroids": np.random.randn(num_partitions, 128).astype(np.float32), - "pq_codebook": np.random.randn( - num_sub_vectors, 256, 128 // num_sub_vectors - ).astype(np.float32), - } - - monkeypatch.setattr(builder_module, "prepare_global_ivf_pq_on_cuvs", fake_prepare) - - prepared = builder.prepare_global_ivf_pq( - num_partitions=4, - num_subvectors=16, - distance_type="l2", - accelerator="cuvs", - sample_rate=7, - max_iters=20, - ) - - assert calls["column"] == "vector" - assert calls["num_partitions"] == 4 - assert calls["num_sub_vectors"] == 16 - assert calls["distance_type"] == "l2" - assert calls["accelerator"] == "cuvs" - assert prepared["ivf_centroids"].shape == (4, 128) - assert prepared["pq_codebook"].shape == (16, 256, 8) - - -def test_train_ivf_pq_on_cuvs_nullable_vectors(tmp_path, monkeypatch): - _disable_rust_cuvs_backend(monkeypatch) - tbl = 
create_table(nvec=32, ndim=16, nullify=True) - dataset = lance.write_dataset(tbl, tmp_path) - - class FakeIndex: - centers = np.random.randn(4, 16).astype(np.float32) - pq_centers = np.random.randn(4, 256, 4).astype(np.float32) - - class FakeIvfPqModule: - class IndexParams: - def __init__(self, **kwargs): - self.kwargs = kwargs - - @staticmethod - def build(build_params, matrix): - assert build_params.kwargs["n_lists"] == 4 - assert matrix.shape[1] == 16 - assert matrix.dtype == np.float32 - return FakeIndex() - - monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) - - centroids, pq_codebook = lance_cuvs.train_ivf_pq_on_cuvs( - dataset, - "vector", - 4, - "L2", - "cuvs", - 4, - sample_rate=4, - ) - - assert centroids.shape == (4, 16) - assert pq_codebook.shape == (4, 256, 4) - - -def test_train_ivf_pq_on_cuvs_uses_num_sub_vectors_for_pq_dim( - tmp_path, monkeypatch -): - _disable_rust_cuvs_backend(monkeypatch) - dataset = lance.write_dataset(create_table(nvec=32, ndim=16), tmp_path) - calls = {} - - class FakeIndex: - centers = np.random.randn(4, 16).astype(np.float32) - pq_centers = np.random.randn(2, 256, 8).astype(np.float32) - - class FakeIvfPqModule: - class IndexParams: - def __init__(self, **kwargs): - calls.update(kwargs) - - @staticmethod - def build(build_params, matrix): - assert matrix.shape[1] == 16 - return FakeIndex() - - monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) - - centroids, pq_codebook = lance_cuvs.train_ivf_pq_on_cuvs( - dataset, - "vector", - 4, - "l2", - "cuvs", - 2, - sample_rate=4, - ) - - assert calls["pq_dim"] == 2 - assert centroids.shape == (4, 16) - assert pq_codebook.shape == (2, 256, 8) - - -def test_normalize_pq_codebook_accepts_subvector_dim_first_layout(): - class FakeIndex: - pq_centers = np.random.randn(8, 16, 256).astype(np.float32) - - pq_codebook = lance_cuvs._normalize_pq_codebook( - FakeIndex(), num_sub_vectors=16, num_bits=8, dimension=128 - ) - - assert 
pq_codebook.shape == (16, 256, 8) - - -def test_cuvs_as_numpy_prefers_copy_to_host(): - class FakeDeviceTensor: - def copy_to_host(self): - return np.arange(6, dtype=np.float32).reshape(2, 3) - - array = lance_cuvs._as_numpy(FakeDeviceTensor()) - - assert isinstance(array, np.ndarray) - assert array.shape == (2, 3) - assert array.dtype == np.float32 - - -def test_annotate_precomputed_encoded_dataset_scans_fragment_directly(tmp_path): - dataset_uri = tmp_path / "encoded_dataset" - - def make_table(partition_ids: list[int], row_id_start: int): - part_ids = np.asarray(partition_ids, dtype=np.uint32) - row_ids = pa.array( - np.arange(row_id_start, row_id_start + len(partition_ids), dtype=np.uint64) - ) - pq_values = pa.array(np.zeros(len(partition_ids) * 4, dtype=np.uint8)) - pq_codes = pa.FixedSizeListArray.from_arrays(pq_values, 4) - return pa.Table.from_arrays( - [row_ids, pa.array(part_ids), pq_codes], - names=["row_id", "__ivf_part_id", "__pq_code"], + with pytest.raises(ValueError, match="not built into Lance"): + builder.prepare_global_ivf_pq( + num_partitions=4, + num_subvectors=16, + distance_type="l2", + accelerator="cuvs", + sample_rate=7, + max_iters=20, ) - ds = lance.write_dataset(make_table([0, 1, 1, 0], 0), dataset_uri) - ds = lance.write_dataset(make_table([2, 3, 2, 3], 4), dataset_uri, mode="append") - - lance_cuvs._annotate_precomputed_encoded_dataset(ds, [2, 2, 2, 2]) - - metadata = ds.metadata() - partition_fragments = json.loads( - metadata[ - lance_cuvs.PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY - ] - ) - assert partition_fragments == [[0], [0], [1], [1]] - - -def test_one_pass_assign_ivf_pq_on_cuvs_writes_partition_artifact(tmp_path, monkeypatch): - _disable_rust_cuvs_backend(monkeypatch) - tbl = create_table(nvec=32, ndim=16) - dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_src") - - ivf_centroids = np.random.randn(4, 16).astype(np.float32) - pq_codebook = np.random.randn(4, 256, 4).astype(np.float32) - - class 
FakeDeviceTensor: - def __init__(self, array): - self._array = array - def copy_to_host(self): - return self._array +def test_create_index_rejects_missing_precomputed_partition_artifact(tmp_path): + dataset = lance.write_dataset(create_table(nvec=64, ndim=128), tmp_path / "artifact_src") - class FakeCupyArray: - def __init__(self, array): - self.array = array - - class FakeCupyModule: - @staticmethod - def asarray(array): - return FakeCupyArray(array) - - class FakeIndex: - pq_dim = 4 - pq_bits = 8 - - class FakeIvfPqModule: - @staticmethod - def transform(index, vectors): - assert isinstance(index, FakeIndex) - assert isinstance(vectors, FakeCupyArray) - labels = np.arange(len(vectors.array), dtype=np.uint32) % 4 - pq_codes = np.full((len(vectors.array), 4), 7, dtype=np.uint8) - return FakeDeviceTensor(labels), FakeDeviceTensor(pq_codes) - - monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: FakeIvfPqModule()) - monkeypatch.setattr(lance_cuvs, "_optional_cupy", lambda: FakeCupyModule()) - - artifact_root, artifact_files = lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( - dataset, - "vector", - "l2", - "cuvs", - ivf_centroids, - pq_codebook, - trained_index=FakeIndex(), - batch_size=8, - ) - - manifest_path = Path(artifact_root) / lance_cuvs.PARTITION_ARTIFACT_MANIFEST_FILE_NAME - metadata_path = Path(artifact_root) / lance_cuvs.PARTITION_ARTIFACT_METADATA_FILE_NAME - - assert manifest_path.exists() - assert metadata_path.exists() - assert any(path.endswith(".lance") for path in artifact_files) - - manifest = json.loads(manifest_path.read_text()) - assert manifest["version"] == lance_cuvs.PARTITION_ARTIFACT_MANIFEST_VERSION - assert manifest["num_partitions"] == 4 - assert manifest["metadata_file"] == lance_cuvs.PARTITION_ARTIFACT_METADATA_FILE_NAME - assert [entry["num_rows"] for entry in manifest["partitions"]] == [8, 8, 8, 8] - assert all(entry["path"] for entry in manifest["partitions"]) - assert all(entry["ranges"] for entry in manifest["partitions"]) - - 
metadata_reader = LanceFileReader(str(metadata_path)) - metadata_table = metadata_reader.read_all().to_table() - assert metadata_table.column("_ivf_centroids").type == pa.list_(pa.list_(pa.float32(), 16)) - assert metadata_table.column("_pq_codebook").type == pa.list_(pa.list_(pa.float32(), 4)) - - bucket_path = Path(artifact_root) / manifest["partitions"][0]["path"] - bucket_reader = LanceFileReader(str(bucket_path)) - bucket_table = bucket_reader.read_all().to_table() - assert bucket_table.column("_rowid").type == pa.uint64() - assert bucket_table.column("__pq_code").type == pa.list_(pa.uint8(), 4) - - -def test_one_pass_assign_ivf_pq_on_cuvs_rejects_incompatible_transform_width( - tmp_path, - monkeypatch, -): - _disable_rust_cuvs_backend(monkeypatch) - tbl = create_table(nvec=32, ndim=128) - dataset = lance.write_dataset(tbl, tmp_path / "cuvs_assign_incompatible") - - ivf_centroids = np.random.randn(4, 128).astype(np.float32) - pq_codebook = np.random.randn(16, 256, 8).astype(np.float32) - monkeypatch.setattr(lance_cuvs, "_require_cuvs", lambda: object()) - - class FakeIndex: - pq_dim = 8 - pq_bits = 8 - - with pytest.raises( - ValueError, - match="cuVS transform output is incompatible with Lance IVF_PQ", - ): - lance_cuvs.one_pass_assign_ivf_pq_on_cuvs( - dataset, + with pytest.raises(Exception): + dataset.create_index( "vector", - "l2", - "cuvs", - ivf_centroids, - pq_codebook, - trained_index=FakeIndex(), - batch_size=8, + index_type="IVF_PQ", + num_partitions=4, + num_sub_vectors=16, + ivf_centroids=np.random.randn(4, 128).astype(np.float32), + pq_codebook=np.random.randn(16, 256, 8).astype(np.float32), + precomputed_partition_artifact_uri=str(tmp_path / "missing_artifact"), ) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 19c3e4ec5d4..0a0342019db 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -3371,10 +3371,6 @@ fn prepare_vector_index_params( ivf_params.precomputed_partitions_file = Some(f.to_string()); }; - if let 
Some(uri) = kwargs.get_item("precomputed_encoded_dataset_uri")? { - ivf_params.precomputed_encoded_dataset_uri = Some(uri.to_string()); - }; - if let Some(uri) = kwargs.get_item("precomputed_partition_artifact_uri")? { ivf_params.precomputed_partition_artifact_uri = Some(uri.to_string()); }; diff --git a/python/src/file.rs b/python/src/file.rs index eb830dc4a73..da8ba3e76bb 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -18,7 +18,6 @@ use arrow_array::{RecordBatch, RecordBatchReader, UInt32Array}; use arrow_schema::Schema as ArrowSchema; use bytes::Bytes; use futures::stream::StreamExt; -use lance::index::vector::PartitionArtifactBuilder as CorePartitionArtifactBuilder; use lance::io::{ObjectStore, RecordBatchStream}; use lance_core::cache::LanceCache; use lance_core::utils::path::LancePathExt; @@ -371,82 +370,6 @@ impl Drop for LanceFileWriter { } } -#[pyclass] -pub struct PartitionArtifactBuilder { - inner: Arc>, -} - -impl PartitionArtifactBuilder { - #[allow(clippy::too_many_arguments)] - async fn open( - uri_or_path: String, - num_partitions: usize, - pq_code_width: usize, - storage_options: Option>, - storage_options_provider: Option>, - ) -> PyResult { - let (object_store, path) = object_store_from_uri_or_path_with_provider( - uri_or_path, - storage_options, - storage_options_provider, - ) - .await?; - let inner = CorePartitionArtifactBuilder::try_new_with_store( - object_store, - path, - num_partitions, - pq_code_width, - ) - .infer_error()?; - Ok(Self { - inner: Arc::new(Mutex::new(inner)), - }) - } -} - -#[pymethods] -impl PartitionArtifactBuilder { - #[new] - #[pyo3(signature=(uri_or_path, num_partitions, pq_code_width, storage_options=None, storage_options_provider=None))] - #[allow(clippy::too_many_arguments)] - pub fn new( - uri_or_path: String, - num_partitions: usize, - pq_code_width: usize, - storage_options: Option>, - storage_options_provider: Option<&Bound<'_, PyAny>>, - ) -> PyResult { - let provider = storage_options_provider - 
.map(crate::storage_options::py_object_to_storage_options_provider) - .transpose()?; - rt().block_on( - None, - Self::open( - uri_or_path, - num_partitions, - pq_code_width, - storage_options, - provider, - ), - )? - } - - pub fn append_batch(&self, batch: PyArrowType) -> PyResult<()> { - rt().runtime.block_on(async { - self.inner.lock().await.append_batch(&batch.0).await - }) - .infer_error() - } - - #[pyo3(signature=(metadata_file, total_loss=None))] - pub fn finish(&self, metadata_file: String, total_loss: Option) -> PyResult> { - rt().runtime.block_on(async { - self.inner.lock().await.finish(&metadata_file, total_loss).await - }) - .infer_error() - } -} - pub async fn object_store_from_uri_or_path_no_options( uri_or_path: impl AsRef, ) -> PyResult<(Arc, Path)> { diff --git a/python/src/lib.rs b/python/src/lib.rs index 819e3fddc3e..9730f2ba1c5 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -51,8 +51,7 @@ use dataset::{ use env_logger::{Builder, Env}; use file::{ LanceBufferDescriptor, LanceColumnMetadata, LanceFileMetadata, LanceFileReader, - LanceFileStatistics, LanceFileWriter, LancePageMetadata, PartitionArtifactBuilder, - stable_version, + LanceFileStatistics, LanceFileWriter, LancePageMetadata, stable_version, }; use log::Level; use pyo3::exceptions::PyIOError; @@ -259,7 +258,6 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/rust/lance-index/src/vector/ivf/builder.rs b/rust/lance-index/src/vector/ivf/builder.rs index 155b33f58b9..9dfcd675be8 100644 --- a/rust/lance-index/src/vector/ivf/builder.rs +++ b/rust/lance-index/src/vector/ivf/builder.rs @@ -48,11 +48,6 @@ pub struct IvfBuildParams { /// The input is expected to be (/dir/to/buffers, [buffer1.lance, buffer2.lance, ...]) pub precomputed_shuffle_buffers: Option<(String, Vec)>, - /// Precomputed encoded dataset (_rowid/row_id -> 
partition_id, pq_code). - /// Mutually exclusive with `precomputed_partitions_file` and `precomputed_shuffle_buffers`. - /// Requires `centroids` to be set. - pub precomputed_encoded_dataset_uri: Option, - /// Precomputed partitioned artifact produced by an external backend. /// Mutually exclusive with other precomputed inputs and requires `centroids` to be set. pub precomputed_partition_artifact_uri: Option, @@ -76,7 +71,6 @@ impl Default for IvfBuildParams { sample_rate: 256, // See faiss precomputed_partitions_file: None, precomputed_shuffle_buffers: None, - precomputed_encoded_dataset_uri: None, precomputed_partition_artifact_uri: None, shuffle_partition_batches: 1024 * 10, shuffle_partition_concurrency: 2, diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index 48235a65582..684f12b2d96 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -8,7 +8,6 @@ use std::sync::Arc; use std::{any::Any, collections::HashMap}; pub mod builder; -mod encoded_dataset; pub mod ivf; mod partition_artifact; pub mod pq; @@ -1658,7 +1657,6 @@ fn derive_ivf_params(ivf_model: &IvfModel) -> IvfBuildParams { sample_rate: 256, // Default precomputed_partitions_file: None, precomputed_shuffle_buffers: None, - precomputed_encoded_dataset_uri: None, precomputed_partition_artifact_uri: None, shuffle_partition_batches: 1024 * 10, // Default shuffle_partition_concurrency: 2, // Default diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 9c9d4b16eed..b753caadc67 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -92,7 +92,6 @@ use crate::index::vector::utils::{infer_vector_dim, infer_vector_element_type}; use super::v2::IVFIndex; use super::{ - encoded_dataset::EncodedDatasetShuffleReader, ivf::load_precomputed_partitions_if_available, partition_artifact::PartitionArtifactShuffleReader, utils::{self, get_vector_type}, @@ -228,19 +227,6 @@ impl 
IvfIndexBuilder )) } - async fn try_open_precomputed_encoded_dataset_reader( - &self, - uri: &str, - ) -> Result> { - let storage_options = self - .ivf_params - .as_ref() - .and_then(|params| params.storage_options.as_ref()); - Ok(Arc::new( - EncodedDatasetShuffleReader::try_open(uri, storage_options).await?, - )) - } - async fn try_open_precomputed_partition_artifact_reader( &self, uri: &str, @@ -649,19 +635,6 @@ impl IvfIndexBuilder return Ok(()); } - if let Some(uri) = self - .ivf_params - .as_ref() - .and_then(|params| params.precomputed_encoded_dataset_uri.as_deref()) - { - log::info!("shuffle with precomputed encoded dataset from {}", uri); - self.shuffle_reader = Some( - self.try_open_precomputed_encoded_dataset_reader(uri) - .await?, - ); - return Ok(()); - } - let stream = match self .ivf_params .as_ref() diff --git a/rust/lance/src/index/vector/encoded_dataset.rs b/rust/lance/src/index/vector/encoded_dataset.rs deleted file mode 100644 index 866f903805c..00000000000 --- a/rust/lance/src/index/vector/encoded_dataset.rs +++ /dev/null @@ -1,370 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -use std::collections::HashMap; -use std::sync::Arc; - -use arrow_schema::Fields; -use futures::StreamExt; -use lance_core::utils::tokio::get_num_compute_intensive_cpus; -use lance_core::{Error, ROW_ID, Result}; -use lance_index::vector::v3::shuffler::ShuffleReader; -use lance_index::vector::{PART_ID_COLUMN, PQ_CODE_COLUMN}; -use lance_io::stream::{RecordBatchStream, RecordBatchStreamAdapter}; -use lance_table::format::Fragment; -use log::warn; -use serde::de::DeserializeOwned; - -use crate::Dataset; -use crate::dataset::builder::DatasetBuilder; - -pub(crate) const PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY: &str = - "lance:index_build:precomputed_encoded_partition_sizes"; -pub(crate) const PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY: &str = - 
"lance:index_build:precomputed_encoded_partition_fragment_ids"; -pub(crate) const PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY: &str = - "lance:index_build:precomputed_encoded_total_loss"; - -const PRECOMPUTED_ROW_ID_COLUMN: &str = "row_id"; - -pub(crate) struct EncodedDatasetShuffleReader { - dataset: Dataset, - row_id_column: String, - partition_sizes: Vec, - partition_fragments: Option>>, - total_loss: Option, -} - -impl EncodedDatasetShuffleReader { - pub(crate) async fn try_open( - uri: &str, - storage_options: Option<&HashMap>, - ) -> Result { - let mut builder = DatasetBuilder::from_uri(uri); - if let Some(storage_options) = storage_options { - builder = builder.with_storage_options(storage_options.clone()); - } - let dataset = builder.load().await?; - Self::try_new(dataset) - } - - pub(crate) fn try_new(dataset: Dataset) -> Result { - let row_id_column = if dataset.schema().field(ROW_ID).is_some() { - ROW_ID.to_string() - } else if dataset.schema().field(PRECOMPUTED_ROW_ID_COLUMN).is_some() { - PRECOMPUTED_ROW_ID_COLUMN.to_string() - } else { - return Err(Error::invalid_input(format!( - "precomputed encoded dataset must contain '{}' or '{}' column", - ROW_ID, PRECOMPUTED_ROW_ID_COLUMN - ))); - }; - - for required_column in [PART_ID_COLUMN, PQ_CODE_COLUMN] { - if dataset.schema().field(required_column).is_none() { - return Err(Error::invalid_input(format!( - "precomputed encoded dataset is missing required column '{}'", - required_column - ))); - } - } - - let metadata = dataset.metadata(); - let partition_sizes: Vec = - parse_required_metadata(metadata, PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY)?; - - let partition_fragments = parse_optional_metadata::>>( - metadata, - PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY, - )? 
- .map(|partition_fragment_ids| resolve_partition_fragments(&dataset, partition_fragment_ids)) - .transpose()?; - - if let Some(partition_fragments) = partition_fragments.as_ref() { - if partition_fragments.len() != partition_sizes.len() { - return Err(Error::invalid_input(format!( - "metadata '{}' has {} partitions but '{}' has {}", - PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY, - partition_fragments.len(), - PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY, - partition_sizes.len(), - ))); - } - } - - let total_loss = - parse_optional_metadata::(metadata, PRECOMPUTED_ENCODED_TOTAL_LOSS_METADATA_KEY)?; - - Ok(Self { - dataset, - row_id_column, - partition_sizes, - partition_fragments, - total_loss, - }) - } - - fn rename_row_id( - stream: impl RecordBatchStream + Unpin + 'static, - row_id_idx: usize, - ) -> impl RecordBatchStream + Unpin + 'static { - let new_schema = Arc::new(arrow_schema::Schema::new( - stream - .schema() - .fields - .iter() - .enumerate() - .map(|(field_idx, field)| { - if field_idx == row_id_idx { - arrow_schema::Field::new( - ROW_ID, - field.data_type().clone(), - field.is_nullable(), - ) - } else { - field.as_ref().clone() - } - }) - .collect::(), - )); - RecordBatchStreamAdapter::new( - new_schema.clone(), - stream.map(move |batch| match batch { - Ok(batch) => { - arrow_array::RecordBatch::try_new(new_schema.clone(), batch.columns().to_vec()) - .map_err(Error::from) - } - Err(error) => Err(error), - }), - ) - } -} - -#[async_trait::async_trait] -impl ShuffleReader for EncodedDatasetShuffleReader { - async fn read_partition( - &self, - partition_id: usize, - ) -> Result>> { - if partition_id >= self.partition_sizes.len() { - return Ok(None); - } - if self.partition_sizes[partition_id] == 0 { - return Ok(None); - } - - let mut scanner = self.dataset.scan(); - scanner.batch_readahead(get_num_compute_intensive_cpus()); - scanner.project(&[self.row_id_column.as_str(), PART_ID_COLUMN, PQ_CODE_COLUMN])?; - - if let 
Some(partition_fragments) = self.partition_fragments.as_ref() { - let fragments = &partition_fragments[partition_id]; - if fragments.is_empty() { - warn!( - "precomputed encoded dataset metadata has no fragments for non-empty partition {}, falling back to filtered scan", - partition_id - ); - } else { - scanner.with_fragments(fragments.clone()); - } - } - - scanner.filter(&format!("{PART_ID_COLUMN} = {partition_id}"))?; - let stream = scanner.try_into_stream().await?; - if let Some((row_id_idx, _)) = stream.schema().column_with_name(PRECOMPUTED_ROW_ID_COLUMN) { - Ok(Some(Box::new(Self::rename_row_id(stream, row_id_idx)))) - } else { - Ok(Some(Box::new(stream))) - } - } - - fn partition_size(&self, partition_id: usize) -> Result { - Ok(self.partition_sizes.get(partition_id).copied().unwrap_or(0)) - } - - fn total_loss(&self) -> Option { - self.total_loss - } -} - -fn parse_required_metadata( - metadata: &HashMap, - key: &str, -) -> Result { - let value = metadata.get(key).ok_or_else(|| { - Error::invalid_input(format!( - "precomputed encoded dataset is missing required metadata '{}'", - key - )) - })?; - parse_metadata_value(value, key) -} - -fn parse_optional_metadata( - metadata: &HashMap, - key: &str, -) -> Result> { - metadata - .get(key) - .map(|value| parse_metadata_value(value, key)) - .transpose() -} - -fn parse_metadata_value(value: &str, key: &str) -> Result { - serde_json::from_str(value).map_err(|error| { - Error::invalid_input(format!( - "failed to parse precomputed encoded dataset metadata '{}' from '{}': {}", - key, value, error - )) - }) -} - -fn resolve_partition_fragments( - dataset: &Dataset, - partition_fragment_ids: Vec>, -) -> Result>> { - let fragments_by_id = dataset - .fragments() - .iter() - .cloned() - .map(|fragment| (fragment.id, fragment)) - .collect::>(); - - partition_fragment_ids - .into_iter() - .map(|fragment_ids| { - fragment_ids - .into_iter() - .map(|fragment_id| { - fragments_by_id.get(&fragment_id).cloned().ok_or_else(|| { - 
Error::invalid_input(format!( - "precomputed encoded dataset metadata references unknown fragment id {}", - fragment_id - )) - }) - }) - .collect() - }) - .collect() -} - -#[cfg(test)] -mod tests { - use super::*; - - use arrow_array::{ - FixedSizeListArray, RecordBatch, RecordBatchIterator, UInt8Array, UInt32Array, UInt64Array, - cast::AsArray, - }; - use futures::TryStreamExt; - use lance_arrow::FixedSizeListArrayExt; - - use crate::dataset::WriteParams; - - #[tokio::test] - async fn encoded_dataset_reader_reads_mapped_fragments_and_renames_row_id() { - let schema = Arc::new(arrow_schema::Schema::new(vec![ - arrow_schema::Field::new("row_id", arrow_schema::DataType::UInt64, false), - arrow_schema::Field::new(PART_ID_COLUMN, arrow_schema::DataType::UInt32, false), - arrow_schema::Field::new( - PQ_CODE_COLUMN, - arrow_schema::DataType::FixedSizeList( - Arc::new(arrow_schema::Field::new( - "item", - arrow_schema::DataType::UInt8, - true, - )), - 2, - ), - true, - ), - ])); - - let batch1 = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(UInt64Array::from(vec![10_u64, 11])), - Arc::new(UInt32Array::from(vec![0_u32, 1])), - Arc::new( - FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![1, 2, 3, 4]), 2) - .unwrap(), - ), - ], - ) - .unwrap(); - let batch2 = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(UInt64Array::from(vec![12_u64, 13])), - Arc::new(UInt32Array::from(vec![1_u32, 1])), - Arc::new( - FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![5, 6, 7, 8]), 2) - .unwrap(), - ), - ], - ) - .unwrap(); - - let reader = RecordBatchIterator::new(vec![Ok(batch1), Ok(batch2)], schema); - let write_params = WriteParams { - max_rows_per_file: 2, - max_rows_per_group: 2, - ..Default::default() - }; - let mut dataset = Dataset::write( - reader, - "memory://precomputed-encoded-reader", - Some(write_params), - ) - .await - .unwrap(); - - let fragment_ids = dataset - .get_fragments() - .into_iter() - .map(|fragment| 
fragment.metadata().id) - .collect::>(); - assert_eq!(fragment_ids.len(), 2); - - dataset - .update_metadata(vec![ - ( - PRECOMPUTED_ENCODED_PARTITION_SIZES_METADATA_KEY.to_string(), - serde_json::to_string(&vec![1_usize, 3]).unwrap(), - ), - ( - PRECOMPUTED_ENCODED_PARTITION_FRAGMENT_IDS_METADATA_KEY.to_string(), - serde_json::to_string(&vec![ - vec![fragment_ids[0] as u64], - vec![fragment_ids[0] as u64, fragment_ids[1] as u64], - ]) - .unwrap(), - ), - ]) - .await - .unwrap(); - - let reader = EncodedDatasetShuffleReader::try_new(dataset).unwrap(); - assert_eq!(reader.partition_size(0).unwrap(), 1); - assert_eq!(reader.partition_size(1).unwrap(), 3); - - let stream = reader.read_partition(1).await.unwrap().unwrap(); - let batches = stream.try_collect::>().await.unwrap(); - let row_ids = batches - .iter() - .flat_map(|batch| { - batch[ROW_ID] - .as_primitive::() - .values() - .iter() - .copied() - }) - .collect::>(); - assert_eq!(row_ids, vec![11, 12, 13]); - assert!( - batches - .iter() - .all(|batch| batch.column_by_name("row_id").is_none()) - ); - } -} diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 4841c98d661..9f51459d55f 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -1204,12 +1204,6 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } - if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.centroids.is_none() { - return Err(Error::index( - "precomputed_encoded_dataset_uri requires centroids to be set".to_string(), - )); - } - if ivf.precomputed_partition_artifact_uri.is_some() && ivf.centroids.is_none() { return Err(Error::index( "precomputed_partition_artifact_uri requires centroids to be set".to_string(), @@ -1223,20 +1217,6 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } - if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.precomputed_partitions_file.is_some() { - return Err(Error::index( - 
"precomputed_encoded_dataset_uri and precomputed_partitions_file are mutually exclusive" - .to_string(), - )); - } - - if ivf.precomputed_encoded_dataset_uri.is_some() && ivf.precomputed_shuffle_buffers.is_some() { - return Err(Error::index( - "precomputed_encoded_dataset_uri and precomputed_shuffle_buffers are mutually exclusive" - .to_string(), - )); - } - if ivf.precomputed_partition_artifact_uri.is_some() && ivf.precomputed_partitions_file.is_some() { return Err(Error::index( @@ -1253,15 +1233,6 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> { )); } - if ivf.precomputed_partition_artifact_uri.is_some() - && ivf.precomputed_encoded_dataset_uri.is_some() - { - return Err(Error::index( - "precomputed_partition_artifact_uri and precomputed_encoded_dataset_uri are mutually exclusive" - .to_string(), - )); - } - Ok(()) } @@ -1273,12 +1244,6 @@ fn sanity_check_params(ivf: &IvfBuildParams, pq: &PQBuildParams) -> Result<()> { )); } - if ivf.precomputed_encoded_dataset_uri.is_some() && pq.codebook.is_none() { - return Err(Error::index( - "precomputed_encoded_dataset_uri requires codebooks to be set".to_string(), - )); - } - if ivf.precomputed_partition_artifact_uri.is_some() && pq.codebook.is_none() { return Err(Error::index( "precomputed_partition_artifact_uri requires codebooks to be set".to_string(), From 5ea24989c238e3c0f8de8cf673aaf2e1f3a70213 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 8 Apr 2026 20:01:02 +0800 Subject: [PATCH 17/21] python: delegate cuvs acceleration to external backend --- python/python/lance/cuvs.py | 111 ++++++++++++++++ python/python/lance/dataset.py | 137 ++++++++++++-------- python/python/lance/indices/builder.py | 33 +++-- python/python/tests/test_vector_index.py | 155 +++++++++++++++++++++-- 4 files changed, 362 insertions(+), 74 deletions(-) create mode 100644 python/python/lance/cuvs.py diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py new file mode 100644 index 00000000000..ba7a1a67738 --- 
/dev/null +++ b/python/python/lance/cuvs.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +from __future__ import annotations + +import os +import tempfile +from importlib import import_module +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + + +def is_cuvs_accelerator(accelerator: object) -> bool: + return isinstance(accelerator, str) and accelerator.lower() == "cuvs" + + +def _require_lance_cuvs(): + try: + return import_module("lance_cuvs") + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "accelerator='cuvs' requires the external 'lance-cuvs' package " + "to be installed." + ) from exc + + +def build_vector_index_on_cuvs( + dataset, + column: str, + metric_type: str, + accelerator: str, + num_partitions: int, + num_sub_vectors: int, + dst_dataset_uri: str | Path | None = None, + *, + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + batch_size: int = 1024 * 128, + filter_nan: bool = True, +): + if not is_cuvs_accelerator(accelerator): + raise ValueError("build_vector_index_on_cuvs requires accelerator='cuvs'") + + backend = _require_lance_cuvs() + artifact_uri = ( + os.fspath(dst_dataset_uri) + if dst_dataset_uri is not None + else tempfile.mkdtemp(prefix="lance-cuvs-artifact-") + ) + training = backend.train_ivf_pq( + dataset.uri, + column, + metric_type=metric_type, + num_partitions=num_partitions, + num_sub_vectors=num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, + ) + artifact = backend.build_ivf_pq_artifact( + dataset.uri, + column, + training=training, + artifact_uri=artifact_uri, + batch_size=batch_size, + filter_nan=filter_nan, + ) + return ( + artifact.artifact_uri, + artifact.files, + training.ivf_centroids(), + training.pq_codebook(), + ) + + +def prepare_global_ivf_pq_on_cuvs( + dataset, + column: str, + num_partitions: int, + num_sub_vectors: int, 
+ *, + distance_type: str = "l2", + accelerator: str = "cuvs", + sample_rate: int = 256, + max_iters: int = 50, + num_bits: int = 8, + filter_nan: bool = True, +): + if not is_cuvs_accelerator(accelerator): + raise ValueError("prepare_global_ivf_pq_on_cuvs requires accelerator='cuvs'") + + backend = _require_lance_cuvs() + training = backend.train_ivf_pq( + dataset.uri, + column, + metric_type=distance_type, + num_partitions=num_partitions, + num_sub_vectors=num_sub_vectors, + sample_rate=sample_rate, + max_iters=max_iters, + num_bits=num_bits, + filter_nan=filter_nan, + ) + return { + "ivf_centroids": training.ivf_centroids(), + "pq_codebook": training.pq_codebook(), + } diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 32e9e548d68..7c3a7bcee06 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -39,6 +39,7 @@ from lance.log import LOGGER from .blob import BlobFile +from .cuvs import is_cuvs_accelerator from .dependencies import ( _check_for_numpy, _check_for_torch, @@ -2899,19 +2900,14 @@ def _create_index_impl( # Handle timing for various parts of accelerated builds timers = {} - if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): - raise ValueError( - "accelerator='cuvs' is not built into Lance. " - "Use the external 'lance-cuvs' package to produce a " - "precomputed partition artifact and then call create_index " - "with precomputed_partition_artifact_uri." - ) + use_cuvs = is_cuvs_accelerator(accelerator) if accelerator is not None and index_type != "IVF_PQ": LOGGER.warning( "Index type %s does not support GPU acceleration; falling back to CPU", index_type, ) accelerator = None + use_cuvs = False # IMPORTANT: Distributed indexing is CPU-only. Enforce single-node when # accelerator or torch-related paths are detected. 
@@ -2960,52 +2956,79 @@ def _create_index_impl( num_partitions = _target_partition_size_to_num_partitions( num_rows, target_partition_size ) - from .vector import ( - one_pass_assign_ivf_pq_on_accelerator, - one_pass_train_ivf_pq_on_accelerator, - ) + if use_cuvs: + from .cuvs import build_vector_index_on_cuvs + + LOGGER.info("Doing cuVS vector backend build") + timers["ivf+pq_build:start"] = time.time() + artifact_root, _, ivf_centroids, pq_codebook = build_vector_index_on_cuvs( + self, + column[0], + metric, + accelerator, + num_partitions, + num_sub_vectors, + sample_rate=kwargs.get("sample_rate", 256), + max_iters=kwargs.get("max_iters", 50), + num_bits=kwargs.get("num_bits", 8), + batch_size=1024 * 128, + filter_nan=filter_nan, + ) + kwargs["precomputed_partition_artifact_uri"] = artifact_root + timers["ivf+pq_build:end"] = time.time() + ivfpq_build_time = ( + timers["ivf+pq_build:end"] - timers["ivf+pq_build:start"] + ) + LOGGER.info("cuVS ivf+pq build time: %ss", ivfpq_build_time) + else: + from .vector import ( + one_pass_assign_ivf_pq_on_accelerator, + one_pass_train_ivf_pq_on_accelerator, + ) - LOGGER.info("Doing one-pass ivfpq accelerated computations") - timers["ivf+pq_train:start"] = time.time() - ( - ivf_centroids, - ivf_kmeans, - pq_codebook, - pq_kmeans_list, - ) = one_pass_train_ivf_pq_on_accelerator( - self, - column[0], - num_partitions, - metric, - accelerator, - num_sub_vectors=num_sub_vectors, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_train:end"] = time.time() - ivfpq_train_time = timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] - LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) - timers["ivf+pq_assign:start"] = time.time() - shuffle_output_dir, shuffle_buffers = one_pass_assign_ivf_pq_on_accelerator( - self, - column[0], - metric, - accelerator, - ivf_kmeans, - pq_kmeans_list, - batch_size=20480, - filter_nan=filter_nan, - ) - timers["ivf+pq_assign:end"] = time.time() - ivfpq_assign_time = ( - 
timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] - ) - LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) + LOGGER.info("Doing one-pass ivfpq accelerated computations") + timers["ivf+pq_train:start"] = time.time() + ( + ivf_centroids, + ivf_kmeans, + pq_codebook, + pq_kmeans_list, + ) = one_pass_train_ivf_pq_on_accelerator( + self, + column[0], + num_partitions, + metric, + accelerator, + num_sub_vectors=num_sub_vectors, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_train:end"] = time.time() + ivfpq_train_time = ( + timers["ivf+pq_train:end"] - timers["ivf+pq_train:start"] + ) + LOGGER.info("ivf+pq training time: %ss", ivfpq_train_time) + timers["ivf+pq_assign:start"] = time.time() + shuffle_output_dir, shuffle_buffers = one_pass_assign_ivf_pq_on_accelerator( + self, + column[0], + metric, + accelerator, + ivf_kmeans, + pq_kmeans_list, + batch_size=20480, + filter_nan=filter_nan, + ) + timers["ivf+pq_assign:end"] = time.time() + ivfpq_assign_time = ( + timers["ivf+pq_assign:end"] - timers["ivf+pq_assign:start"] + ) + LOGGER.info("ivf+pq transform time: %ss", ivfpq_assign_time) - kwargs["precomputed_shuffle_buffers"] = shuffle_buffers - kwargs["precomputed_shuffle_buffers_path"] = os.path.join( - shuffle_output_dir, "data" - ) + kwargs["precomputed_shuffle_buffers"] = shuffle_buffers + kwargs["precomputed_shuffle_buffers_path"] = os.path.join( + shuffle_output_dir, "data" + ) if index_type.startswith("IVF"): if (ivf_centroids is not None) and (ivf_centroids_file is not None): raise ValueError( @@ -3243,7 +3266,12 @@ def create_index( The number of sub-vectors for PQ (Product Quantization). accelerator : str or ``torch.Device``, optional If set, use an accelerator to speed up the training process. - Accepted accelerator: "cuda" (Nvidia GPU) and "mps" (Apple Silicon GPU). + Accepted accelerator: + + - "cuda" (Nvidia GPU) + - "mps" (Apple Silicon GPU) + - "cuvs" for the external `lance-cuvs` backend + If not set, use the CPU. 
index_cache_size : int, optional The size of the index cache in number of entries. Default value is 256. @@ -3357,8 +3385,9 @@ def create_index( Experimental Accelerator (GPU) support: - *accelerate*: use GPU to train IVF partitions. - Only supports CUDA (Nvidia) or MPS (Apple) currently. - Requires PyTorch being installed. + Supports CUDA (Nvidia) and MPS (Apple) via the built-in torch path. + `accelerator="cuvs"` delegates IVF_PQ build preparation to the + external `lance-cuvs` package. .. code-block:: python diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py index a13e92faf8f..00591ead934 100644 --- a/python/python/lance/indices/builder.py +++ b/python/python/lance/indices/builder.py @@ -8,6 +8,8 @@ import numpy as np import pyarrow as pa + +from lance.cuvs import is_cuvs_accelerator, prepare_global_ivf_pq_on_cuvs from lance.indices.ivf import IvfModel from lance.indices.pq import PqModel @@ -114,11 +116,10 @@ def train_ivf( self._verify_ivf_sample_rate(sample_rate, num_partitions, num_rows) distance_type = self._normalize_distance_type(distance_type) self._verify_ivf_params(num_partitions) - if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): - raise ValueError( - "accelerator='cuvs' is not built into Lance. " - "Use the external 'lance-cuvs' package to build training outputs " - "and partition artifacts." + if is_cuvs_accelerator(accelerator): + raise NotImplementedError( + "IndicesBuilder.train_ivf does not support accelerator='cuvs'; " + "use prepare_global_ivf_pq instead" ) if accelerator is None: @@ -255,11 +256,23 @@ def prepare_global_ivf_pq( `IndicesBuilder.train_pq` (indices.train_pq_model). No public method names elsewhere are changed. """ - if isinstance(accelerator, str) and accelerator.lower().startswith("cuvs"): - raise ValueError( - "accelerator='cuvs' is not built into Lance. " - "Use the external 'lance-cuvs' package to build training outputs " - "and partition artifacts." 
+ if is_cuvs_accelerator(accelerator): + if fragment_ids is not None: + raise NotImplementedError( + "fragment_ids is not supported with accelerator='cuvs'" + ) + num_rows = self._count_rows() + num_partitions = self._determine_num_partitions(num_partitions, num_rows) + num_subvectors = self._normalize_pq_params(num_subvectors, self.dimension) + return prepare_global_ivf_pq_on_cuvs( + self.dataset, + self.column[0], + num_partitions, + num_subvectors, + distance_type=distance_type, + accelerator=accelerator, + sample_rate=sample_rate, + max_iters=max_iters, ) # Global IVF training diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 9606c91a724..e5f10028839 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -9,9 +9,11 @@ import string import tempfile import time +from pathlib import Path from typing import Optional import lance +import lance.cuvs as lance_cuvs import numpy as np import pyarrow as pa import pyarrow.compute as pc @@ -534,11 +536,20 @@ def test_create_index_accelerator_fallback(tmp_path, caplog): ) -def test_create_index_rejects_cuvs_accelerator(tmp_path): +def test_create_index_requires_external_cuvs_backend(tmp_path, monkeypatch): tbl = create_table() dataset = lance.write_dataset(tbl, tmp_path) + original_import_module = lance_cuvs.import_module - with pytest.raises(ValueError, match="not built into Lance"): + def _raise_missing(name): + if name == "lance_cuvs": + raise ModuleNotFoundError("No module named 'lance_cuvs'") + return original_import_module(name) + + monkeypatch.setattr(lance_cuvs, "import_module", _raise_missing) + with pytest.raises( + ModuleNotFoundError, match="requires the external 'lance-cuvs' package" + ): dataset.create_index( "vector", index_type="IVF_PQ", @@ -548,18 +559,142 @@ def test_create_index_rejects_cuvs_accelerator(tmp_path): ) -def test_prepare_global_ivf_pq_rejects_cuvs_accelerator(tmp_path): +class 
_FakeCuvsTraining: + def __init__(self, ivf_centroids, pq_codebook): + self._ivf_centroids = ivf_centroids + self._pq_codebook = pq_codebook + + def ivf_centroids(self): + return self._ivf_centroids + + def pq_codebook(self): + return self._pq_codebook + + +class _FakeCuvsArtifact: + def __init__(self, artifact_uri, files): + self.artifact_uri = artifact_uri + self.files = files + + +def _make_fake_cuvs_training(num_partitions: int = 4, dimension: int = 128): + centroids = pa.FixedSizeListArray.from_arrays( + pa.array(np.arange(num_partitions * dimension, dtype=np.float32)), + dimension, + ) + codebook = pa.FixedSizeListArray.from_arrays( + pa.array(np.arange(16 * 256 * 8, dtype=np.float32)), + 8, + ) + return _FakeCuvsTraining(centroids, codebook) + + +def test_build_vector_index_on_cuvs_delegates_to_external_backend(tmp_path, monkeypatch): ds = _make_sample_dataset_base(tmp_path, "prepare_ivf_pq_cuvs_ds", 512, 128) - builder = IndicesBuilder(ds, "vector") - with pytest.raises(ValueError, match="not built into Lance"): - builder.prepare_global_ivf_pq( - num_partitions=4, - num_subvectors=16, - distance_type="l2", - accelerator="cuvs", + calls = {} + training = _make_fake_cuvs_training() + + class _FakeBackend: + def train_ivf_pq(self, dataset_uri, column, **kwargs): + calls["train"] = { + "dataset_uri": dataset_uri, + "column": column, + **kwargs, + } + return training + + def build_ivf_pq_artifact(self, dataset_uri, column, **kwargs): + calls["build"] = { + "dataset_uri": dataset_uri, + "column": column, + **kwargs, + } + return _FakeCuvsArtifact( + artifact_uri=str(tmp_path / "artifact"), + files=[str(tmp_path / "artifact" / "data.lance")], + ) + + monkeypatch.setattr(lance_cuvs, "_require_lance_cuvs", lambda: _FakeBackend()) + + artifact_uri, files, ivf_centroids, pq_codebook = ( + lance_cuvs.build_vector_index_on_cuvs( + ds, + "vector", + "l2", + "cuvs", + 4, + 16, + dst_dataset_uri=tmp_path / "artifact_root", sample_rate=7, max_iters=20, + num_bits=4, + 
batch_size=4096, + filter_nan=False, ) + ) + + assert calls["train"] == { + "dataset_uri": ds.uri, + "column": "vector", + "metric_type": "l2", + "num_partitions": 4, + "num_sub_vectors": 16, + "sample_rate": 7, + "max_iters": 20, + "num_bits": 4, + "filter_nan": False, + } + assert calls["build"]["dataset_uri"] == ds.uri + assert calls["build"]["column"] == "vector" + assert calls["build"]["training"] is training + assert calls["build"]["artifact_uri"] == str(tmp_path / "artifact_root") + assert calls["build"]["batch_size"] == 4096 + assert calls["build"]["filter_nan"] is False + assert artifact_uri == str(tmp_path / "artifact") + assert files == [str(tmp_path / "artifact" / "data.lance")] + assert ivf_centroids.equals(training.ivf_centroids()) + assert pq_codebook.equals(training.pq_codebook()) + + +def test_prepare_global_ivf_pq_delegates_to_external_cuvs_backend(tmp_path, monkeypatch): + ds = _make_sample_dataset_base(tmp_path, "prepare_ivf_pq_cuvs_ds", 512, 128) + builder = IndicesBuilder(ds, "vector") + training = _make_fake_cuvs_training() + calls = {} + + class _FakeBackend: + def train_ivf_pq(self, dataset_uri, column, **kwargs): + calls["train"] = { + "dataset_uri": dataset_uri, + "column": column, + **kwargs, + } + return training + + monkeypatch.setattr(lance_cuvs, "_require_lance_cuvs", lambda: _FakeBackend()) + + prepared = builder.prepare_global_ivf_pq( + num_partitions=4, + num_subvectors=16, + distance_type="l2", + accelerator="cuvs", + sample_rate=7, + max_iters=20, + ) + + assert calls["train"] == { + "dataset_uri": ds.uri, + "column": "vector", + "metric_type": "l2", + "num_partitions": 4, + "num_sub_vectors": 16, + "sample_rate": 7, + "max_iters": 20, + "num_bits": 8, + "filter_nan": True, + } + assert prepared["ivf_centroids"].equals(training.ivf_centroids()) + assert prepared["pq_codebook"].equals(training.pq_codebook()) def test_create_index_rejects_missing_precomputed_partition_artifact(tmp_path): From 
d60e11f8f63d72c776f290e990bb135e35215bbc Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 8 Apr 2026 20:05:10 +0800 Subject: [PATCH 18/21] fix: remove merge leftover import --- rust/lance/src/index/vector/builder.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index fe801fc68aa..3e85a9d40b3 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -3,7 +3,6 @@ use std::cmp::Ordering; use std::collections::HashSet; -use std::future; use std::path::Path as StdPath; use std::sync::Arc; use std::{collections::HashMap, pin::Pin}; From f2548505e7b05488fe9cef4fba2aeeddfe3e8a6c Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 9 Apr 2026 00:29:18 +0800 Subject: [PATCH 19/21] refactor: drop transitional cuvs compatibility paths --- python/src/dataset.rs | 9 +- rust/lance-index/src/vector/ivf/builder.rs | 3 +- rust/lance-index/src/vector/ivf/shuffler.rs | 10 +- rust/lance-index/src/vector/v3/shuffler.rs | 202 +------------------- rust/lance/src/index/vector/builder.rs | 133 +------------ rust/lance/src/index/vector/ivf.rs | 4 +- rust/lance/src/index/vector/ivf/builder.rs | 5 +- rust/lance/src/index/vector/utils.rs | 2 +- 8 files changed, 30 insertions(+), 338 deletions(-) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 825f835f56a..4b058ce8382 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -20,6 +20,7 @@ use chrono::{Duration, TimeDelta, Utc}; use futures::{StreamExt, TryFutureExt}; use lance_index::vector::bq::RQBuildParams; use log::error; +use object_store::path::Path; use pyo3::exceptions::{PyStopIteration, PyTypeError}; use pyo3::types::{PyBytes, PyInt, PyList, PySet, PyString, PyTuple}; use pyo3::{IntoPyObjectExt, prelude::*}; @@ -3624,12 +3625,18 @@ fn prepare_vector_index_params( kwargs.get_item("precomputed_shuffle_buffers_path")?, ) { (Some(l), Some(p)) => { + let path = 
Path::parse(p.to_string()).map_err(|e| { + PyValueError::new_err(format!( + "Failed to parse precomputed_shuffle_buffers_path: {}", + e + )) + })?; let list = l .downcast::()? .iter() .map(|f| f.to_string()) .collect(); - ivf_params.precomputed_shuffle_buffers = Some((p.to_string(), list)); + ivf_params.precomputed_shuffle_buffers = Some((path, list)); } (None, None) => {} _ => { diff --git a/rust/lance-index/src/vector/ivf/builder.rs b/rust/lance-index/src/vector/ivf/builder.rs index 9dfcd675be8..caccd92d6c4 100644 --- a/rust/lance-index/src/vector/ivf/builder.rs +++ b/rust/lance-index/src/vector/ivf/builder.rs @@ -11,6 +11,7 @@ use arrow_array::{Array, FixedSizeListArray, UInt32Array, UInt64Array}; use futures::TryStreamExt; use lance_core::error::{Error, Result}; use lance_io::stream::RecordBatchStream; +use object_store::path::Path; /// Parameters to build IVF partitions #[derive(Debug, Clone)] @@ -46,7 +47,7 @@ pub struct IvfBuildParams { /// requires `centroids` to be set /// /// The input is expected to be (/dir/to/buffers, [buffer1.lance, buffer2.lance, ...]) - pub precomputed_shuffle_buffers: Option<(String, Vec)>, + pub precomputed_shuffle_buffers: Option<(Path, Vec)>, /// Precomputed partitioned artifact produced by an external backend. /// Mutually exclusive with other precomputed inputs and requires `centroids` to be set. 
diff --git a/rust/lance-index/src/vector/ivf/shuffler.rs b/rust/lance-index/src/vector/ivf/shuffler.rs index f78be7b0be2..f4e03c8f036 100644 --- a/rust/lance-index/src/vector/ivf/shuffler.rs +++ b/rust/lance-index/src/vector/ivf/shuffler.rs @@ -246,18 +246,12 @@ pub async fn shuffle_dataset( num_partitions: u32, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(String, Vec)>, + precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result>>> { // step 1: either use precomputed shuffle files or write shuffle data to a file let shuffler = if let Some((path, buffers)) = precomputed_shuffle_buffers { info!("Precomputed shuffle files provided, skip calculation of IVF partition."); - if path.contains("://") { - return Err(Error::not_supported( - "legacy IVF shuffler does not support remote precomputed_shuffle_buffers; use the V3 vector index builder path instead".to_string(), - )); - } - let mut shuffler = - IvfShuffler::try_new(num_partitions, Some(Path::parse(&path)?), true, None)?; + let mut shuffler = IvfShuffler::try_new(num_partitions, Some(path), true, None)?; unsafe { shuffler.set_unsorted_buffers(&buffers); } diff --git a/rust/lance-index/src/vector/v3/shuffler.rs b/rust/lance-index/src/vector/v3/shuffler.rs index 45c719d523a..20bed4cdc23 100644 --- a/rust/lance-index/src/vector/v3/shuffler.rs +++ b/rust/lance-index/src/vector/v3/shuffler.rs @@ -4,7 +4,6 @@ //! Shuffler is a component that takes a stream of record batches and shuffles them into //! the corresponding IVF partitions. 
-use std::collections::HashMap; use std::ops::Range; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; @@ -37,13 +36,6 @@ use object_store::path::Path; use crate::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN}; -const SHUFFLE_NUM_PARTITIONS_METADATA_KEY: &str = "lance:shuffle:num_partitions"; -const SHUFFLE_NUM_BATCHES_METADATA_KEY: &str = "lance:shuffle:num_batches"; -const SHUFFLE_PARTITION_COUNTS_METADATA_KEY: &str = "lance:shuffle:partition_counts"; -const SHUFFLE_TOTAL_LOSS_METADATA_KEY: &str = "lance:shuffle:total_loss"; -pub const SHUFFLE_DATA_FILE_NAME: &str = "shuffle_data.lance"; -pub const SHUFFLE_OFFSETS_FILE_NAME: &str = "shuffle_offsets.lance"; - #[async_trait::async_trait] /// A reader that can read the shuffled partitions. pub trait ShuffleReader: Send + Sync { @@ -443,7 +435,7 @@ impl Shuffler for TwoFileShuffler { ); // Create data file writer - let data_path = self.output_dir.child(SHUFFLE_DATA_FILE_NAME); + let data_path = self.output_dir.child("shuffle_data.lance"); let spill_path = self.output_dir.child("shuffle_data.spill"); let writer = self.object_store.create(&data_path).await?; let mut file_writer = FileWriter::try_new( @@ -454,7 +446,7 @@ impl Shuffler for TwoFileShuffler { .with_page_metadata_spill(self.object_store.clone(), spill_path); // Create offsets file writer - let offsets_path = self.output_dir.child(SHUFFLE_OFFSETS_FILE_NAME); + let offsets_path = self.output_dir.child("shuffle_offsets.lance"); let spill_path = self.output_dir.child("shuffle_offsets.spill"); let writer = self.object_store.create(&offsets_path).await?; let mut offsets_writer = FileWriter::try_new( @@ -535,37 +527,12 @@ impl Shuffler for TwoFileShuffler { .await?; } - let partition_counts_json = serde_json::to_string(&partition_counts).map_err(|e| { - Error::invalid_input(format!("Failed to serialize shuffle partition counts: {e}")) - })?; - let num_partitions_str = num_partitions.to_string(); - let num_batches_str = num_batches - 
.load(std::sync::atomic::Ordering::Relaxed) - .to_string(); - let total_loss_str = total_loss.lock().unwrap().to_string(); - for writer in [&mut file_writer, &mut offsets_writer] { - writer.add_schema_metadata( - SHUFFLE_NUM_PARTITIONS_METADATA_KEY, - num_partitions_str.clone(), - ); - writer.add_schema_metadata(SHUFFLE_NUM_BATCHES_METADATA_KEY, num_batches_str.clone()); - writer.add_schema_metadata( - SHUFFLE_PARTITION_COUNTS_METADATA_KEY, - partition_counts_json.clone(), - ); - writer.add_schema_metadata(SHUFFLE_TOTAL_LOSS_METADATA_KEY, total_loss_str.clone()); - } - // Finish files file_writer.finish().await?; offsets_writer.finish().await?; - let num_batches = num_batches_str - .parse::() - .expect("num_batches string was produced from u64"); - let total_loss_val = total_loss_str - .parse::() - .expect("total_loss string was produced from f64"); + let num_batches = num_batches.load(std::sync::atomic::Ordering::Relaxed); + let total_loss_val = *total_loss.lock().unwrap(); TwoFileShuffleReader::try_new( self.object_store.clone(), @@ -590,46 +557,6 @@ pub struct TwoFileShuffleReader { } impl TwoFileShuffleReader { - pub async fn try_open_existing( - object_store: Arc, - output_dir: Path, - data_file: impl AsRef, - offsets_file: impl AsRef, - ) -> Result> { - let scheduler_config = SchedulerConfig::max_bandwidth(&object_store); - let scheduler = ScanScheduler::new(object_store, scheduler_config); - - let file_reader = FileReader::try_open( - scheduler - .open_file( - &output_dir.child(data_file.as_ref()), - &CachedFileSize::unknown(), - ) - .await?, - None, - Arc::::default(), - &LanceCache::no_cache(), - FileReaderOptions::default(), - ) - .await?; - - let offsets_reader = FileReader::try_open( - scheduler - .open_file( - &output_dir.child(offsets_file.as_ref()), - &CachedFileSize::unknown(), - ) - .await?, - None, - Arc::::default(), - &LanceCache::no_cache(), - FileReaderOptions::default(), - ) - .await?; - - Self::from_existing_readers(scheduler, file_reader, 
offsets_reader) - } - async fn try_new( object_store: Arc, output_dir: Path, @@ -645,7 +572,7 @@ impl TwoFileShuffleReader { let scheduler_config = SchedulerConfig::max_bandwidth(&object_store); let scheduler = ScanScheduler::new(object_store, scheduler_config); - let data_path = output_dir.child(SHUFFLE_DATA_FILE_NAME); + let data_path = output_dir.child("shuffle_data.lance"); let file_reader = FileReader::try_open( scheduler .open_file(&data_path, &CachedFileSize::unknown()) @@ -657,7 +584,7 @@ impl TwoFileShuffleReader { ) .await?; - let offsets_path = output_dir.child(SHUFFLE_OFFSETS_FILE_NAME); + let offsets_path = output_dir.child("shuffle_offsets.lance"); let offsets_reader = FileReader::try_open( scheduler .open_file(&offsets_path, &CachedFileSize::unknown()) @@ -680,87 +607,6 @@ impl TwoFileShuffleReader { })) } - fn from_existing_readers( - scheduler: Arc, - file_reader: FileReader, - offsets_reader: FileReader, - ) -> Result> { - let metadata: &HashMap = &offsets_reader.schema().metadata; - - let num_partitions = metadata - .get(SHUFFLE_NUM_PARTITIONS_METADATA_KEY) - .ok_or_else(|| { - Error::invalid_input(format!( - "Missing required metadata key {SHUFFLE_NUM_PARTITIONS_METADATA_KEY} in precomputed V3 shuffle offsets file" - )) - })? - .parse::() - .map_err(|e| { - Error::invalid_input(format!( - "Invalid value for {SHUFFLE_NUM_PARTITIONS_METADATA_KEY}: {e}" - )) - })?; - let num_batches = metadata - .get(SHUFFLE_NUM_BATCHES_METADATA_KEY) - .ok_or_else(|| { - Error::invalid_input(format!( - "Missing required metadata key {SHUFFLE_NUM_BATCHES_METADATA_KEY} in precomputed V3 shuffle offsets file" - )) - })? 
- .parse::() - .map_err(|e| { - Error::invalid_input(format!( - "Invalid value for {SHUFFLE_NUM_BATCHES_METADATA_KEY}: {e}" - )) - })?; - let partition_counts = serde_json::from_str::>( - metadata - .get(SHUFFLE_PARTITION_COUNTS_METADATA_KEY) - .ok_or_else(|| { - Error::invalid_input(format!( - "Missing required metadata key {SHUFFLE_PARTITION_COUNTS_METADATA_KEY} in precomputed V3 shuffle offsets file" - )) - })?, - ) - .map_err(|e| { - Error::invalid_input(format!( - "Invalid value for {SHUFFLE_PARTITION_COUNTS_METADATA_KEY}: {e}" - )) - })?; - if partition_counts.len() != num_partitions { - return Err(Error::invalid_input(format!( - "Precomputed V3 shuffle partition count length {} does not match num_partitions {}", - partition_counts.len(), - num_partitions - ))); - } - let total_loss = metadata - .get(SHUFFLE_TOTAL_LOSS_METADATA_KEY) - .map(|value| { - value.parse::().map_err(|e| { - Error::invalid_input(format!( - "Invalid value for {SHUFFLE_TOTAL_LOSS_METADATA_KEY}: {e}" - )) - }) - }) - .transpose()? 
- .unwrap_or(0.0); - - if num_batches == 0 { - return Ok(Box::new(EmptyReader)); - } - - Ok(Box::new(Self { - _scheduler: scheduler, - file_reader, - offsets_reader, - num_partitions, - num_batches, - partition_counts, - total_loss, - })) - } - async fn partition_ranges(&self, partition_id: usize) -> Result>> { let mut positions = Vec::with_capacity(self.num_batches as usize * 2); for batch_idx in 0..self.num_batches { @@ -997,42 +843,6 @@ mod tests { assert!((loss - 4.25).abs() < 1e-10, "expected 4.25, got {}", loss); } - #[tokio::test] - async fn test_two_file_shuffler_reopen_existing_files() { - let dir = TempStrDir::default(); - let output_dir = Path::from(dir.as_ref()); - let num_partitions = 3; - - let batch1 = make_batch(&[0, 1, 2], &[10, 20, 30], Some(1.5)); - let batch2 = make_batch(&[2, 0, 1, 0], &[40, 50, 60, 70], Some(2.0)); - - let shuffler = TwoFileShuffler::new(output_dir.clone(), num_partitions); - let stream = batches_to_stream(vec![batch1, batch2]); - let _ = shuffler.shuffle(stream).await.unwrap(); - - let reopened = TwoFileShuffleReader::try_open_existing( - Arc::new(ObjectStore::local()), - output_dir, - SHUFFLE_DATA_FILE_NAME, - SHUFFLE_OFFSETS_FILE_NAME, - ) - .await - .unwrap(); - - assert_eq!(reopened.partition_size(0).unwrap(), 3); - assert_eq!(reopened.partition_size(1).unwrap(), 2); - assert_eq!(reopened.partition_size(2).unwrap(), 2); - - let p0 = collect_partition(reopened.as_ref(), 0).await.unwrap(); - let vals: &Int32Array = p0.column_by_name("val").unwrap().as_primitive(); - let mut v: Vec = vals.iter().map(|x| x.unwrap()).collect(); - v.sort(); - assert_eq!(v, vec![10, 50, 70]); - - let loss = reopened.total_loss().unwrap(); - assert!((loss - 3.5).abs() < 1e-10, "expected 3.5, got {}", loss); - } - #[tokio::test] async fn test_two_file_shuffler_single_batch() { let dir = TempStrDir::default(); diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 3e85a9d40b3..0edfbea4812 100644 --- 
a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -3,7 +3,6 @@ use std::cmp::Ordering; use std::collections::HashSet; -use std::path::Path as StdPath; use std::sync::Arc; use std::{collections::HashMap, pin::Pin}; @@ -45,10 +44,7 @@ use lance_index::vector::quantizer::{QuantizerMetadata, QuantizerStorage}; use lance_index::vector::shared::{SupportedIvfIndexType, write_unified_ivf_and_index_metadata}; use lance_index::vector::storage::STORAGE_METADATA_KEY; use lance_index::vector::transform::Flatten; -use lance_index::vector::v3::shuffler::{ - EmptyReader, IvfShufflerReader, SHUFFLE_DATA_FILE_NAME, SHUFFLE_OFFSETS_FILE_NAME, - TwoFileShuffleReader, -}; +use lance_index::vector::v3::shuffler::{EmptyReader, IvfShufflerReader}; use lance_index::vector::v3::subindex::SubIndexType; use lance_index::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN, PQ_CODE_COLUMN, VectorIndex}; use lance_index::vector::{PART_ID_FIELD, ivf::storage::IvfModel}; @@ -71,9 +67,7 @@ use lance_index::{ MIN_PARTITION_SIZE_PERCENT, }; use lance_io::local::to_local_path; -use lance_io::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor, -}; +use lance_io::object_store::ObjectStore; use lance_io::stream::RecordBatchStream; use lance_io::stream::RecordBatchStreamAdapter; use lance_linalg::distance::{DistanceType, Dot, L2, Normalize}; @@ -85,7 +79,6 @@ use tracing::{Level, instrument, span}; use crate::Dataset; use crate::dataset::ProjectionRequest; -use crate::dataset::builder::DatasetBuilder; use crate::dataset::index::dataset_format_version; use crate::index::vector::ivf::v2::PartitionEntry; use crate::index::vector::utils::infer_vector_dim; @@ -150,83 +143,6 @@ type BuildStream = Pin::Storage, S, f64)>>> + Send>>; impl IvfIndexBuilder { - fn precomputed_shuffle_buffers_uri(root: &str) -> String { - let uri = root.to_string(); - if uri.contains("://") { - uri - } else { - to_local_path(&Path::from(root)) - } - } - - fn 
precomputed_shuffle_buffers_root_uri(root: &str) -> String { - let uri = Self::precomputed_shuffle_buffers_uri(root); - if uri.ends_with("/data") { - uri.trim_end_matches("/data").to_string() - } else { - uri - } - } - - fn object_store_params(&self) -> ObjectStoreParams { - let mut params = ObjectStoreParams::default(); - if let Some(storage_options) = self - .ivf_params - .as_ref() - .and_then(|params| params.storage_options.clone()) - { - params.storage_options_accessor = Some(Arc::new( - StorageOptionsAccessor::with_static_options(storage_options), - )); - } - params - } - - async fn try_open_precomputed_v3_shuffle_reader( - &self, - root: &str, - files: &[String], - ) -> Result>> { - if files.len() != 2 { - return Ok(None); - } - - let mut data_file = None; - let mut offsets_file = None; - for file in files { - let Some(file_name) = StdPath::new(file).file_name() else { - return Ok(None); - }; - match file_name.to_string_lossy().as_ref() { - SHUFFLE_DATA_FILE_NAME => data_file = Some(SHUFFLE_DATA_FILE_NAME), - SHUFFLE_OFFSETS_FILE_NAME => offsets_file = Some(SHUFFLE_OFFSETS_FILE_NAME), - _ => return Ok(None), - } - } - let (Some(data_file), Some(offsets_file)) = (data_file, offsets_file) else { - return Ok(None); - }; - let registry = Arc::new(ObjectStoreRegistry::default()); - let params = self.object_store_params(); - let (object_store, output_dir) = ObjectStore::from_uri_and_params( - registry, - &Self::precomputed_shuffle_buffers_root_uri(root), - ¶ms, - ) - .await?; - - Ok(Some( - TwoFileShuffleReader::try_open_existing( - object_store, - output_dir, - data_file, - offsets_file, - ) - .await? - .into(), - )) - } - async fn try_open_precomputed_partition_artifact_reader( &self, uri: &str, @@ -644,38 +560,11 @@ impl IvfIndexBuilder .as_ref() .and_then(|p| p.precomputed_shuffle_buffers.as_ref()) { - Some((uri, files)) => { - if let Some(reader) = self - .try_open_precomputed_v3_shuffle_reader(uri, files) - .await? 
- { - log::info!("shuffle with precomputed V3 shuffle files from {}", uri); - self.shuffle_reader = Some(reader); - return Ok(()); - } - - let uri = Self::precomputed_shuffle_buffers_root_uri(uri); - let uri = if StdPath::new(&uri) - .file_name() - .is_some_and(|name| name == "data") - { - StdPath::new(&uri) - .parent() - .map(|path| path.to_string_lossy().to_string()) - .unwrap_or(uri) - } else { - uri - }; + Some((uri, _)) => { + let uri = to_local_path(uri); + let uri = uri.trim_end_matches("data"); log::info!("shuffle with precomputed shuffle buffers from {}", uri); - let mut builder = DatasetBuilder::from_uri(&uri); - if let Some(storage_options) = self - .ivf_params - .as_ref() - .and_then(|params| params.storage_options.clone()) - { - builder = builder.with_storage_options(storage_options); - } - let ds = builder.load().await?; + let ds = Dataset::open(uri).await?; ds.scan().try_into_stream().await? } _ => { @@ -2524,14 +2413,4 @@ mod tests { let row_ids = batches[0][ROW_ID].as_primitive::(); assert_eq!(row_ids.values(), &[4, 3, 2, 1, 0]); } - - #[test] - fn precomputed_shuffle_buffer_uri_preserves_remote_uri() { - assert_eq!( - IvfIndexBuilder::::precomputed_shuffle_buffers_root_uri( - "s3://bucket/shuffle" - ), - "s3://bucket/shuffle" - ); - } } diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 229d47f229b..34ce23f1eac 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -1818,7 +1818,7 @@ async fn write_ivf_pq_file( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(String, Vec)>, + precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result<()> { let path = index_dir.child(uuid).child(INDEX_FILE_NAME); let mut writer = object_store.create(&path).await?; @@ -1911,7 +1911,7 @@ async fn write_ivf_hnsw_file( precomputed_partitions: Option>, shuffle_partition_batches: usize, 
shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(String, Vec)>, + precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result<()> { let object_store = dataset.object_store(); let path = dataset.indices_dir().child(uuid).child(INDEX_FILE_NAME); diff --git a/rust/lance/src/index/vector/ivf/builder.rs b/rust/lance/src/index/vector/ivf/builder.rs index bcd47ae4057..9bd1ba95803 100644 --- a/rust/lance/src/index/vector/ivf/builder.rs +++ b/rust/lance/src/index/vector/ivf/builder.rs @@ -22,6 +22,7 @@ use lance_index::vector::{ivf::storage::IvfModel, transform::Transformer}; use lance_io::stream::RecordBatchStreamAdapter; use lance_table::io::manifest::ManifestDescribing; use log::info; +use object_store::path::Path; use tracing::instrument; use lance_core::{Error, ROW_ID, Result, traits::DatasetTakeRows}; @@ -54,7 +55,7 @@ pub(super) async fn build_partitions( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(String, Vec)>, + precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result<()> { let schema = data.schema(); if schema.column_with_name(column).is_none() { @@ -253,7 +254,7 @@ pub(super) async fn build_hnsw_partitions( precomputed_partitions: Option>, shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, - precomputed_shuffle_buffers: Option<(String, Vec)>, + precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result<(Vec, IvfModel)> { let schema = data.schema(); if schema.column_with_name(column).is_none() { diff --git a/rust/lance/src/index/vector/utils.rs b/rust/lance/src/index/vector/utils.rs index 244a02c39bc..19156ac8eed 100644 --- a/rust/lance/src/index/vector/utils.rs +++ b/rust/lance/src/index/vector/utils.rs @@ -372,7 +372,7 @@ impl PartitionLoadLock { /// /// Handles both regular vector columns (FixedSizeList) and multivector columns /// (List\), flattening the latter. 
-pub fn vector_column_to_fsl(batch: &RecordBatch, column: &str) -> Result { +fn vector_column_to_fsl(batch: &RecordBatch, column: &str) -> Result { let array = get_column_from_batch(batch, column)?; match array.data_type() { arrow::datatypes::DataType::FixedSizeList(_, _) => Ok(array.as_fixed_size_list().clone()), From 51a141bd947533dff1659c6ea080197c164a5385 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 9 Apr 2026 00:33:54 +0800 Subject: [PATCH 20/21] docs: document partition artifact internals --- .../src/index/vector/partition_artifact.rs | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/rust/lance/src/index/vector/partition_artifact.rs b/rust/lance/src/index/vector/partition_artifact.rs index cfd2a0f2b4a..fe585f513be 100644 --- a/rust/lance/src/index/vector/partition_artifact.rs +++ b/rust/lance/src/index/vector/partition_artifact.rs @@ -39,6 +39,11 @@ const PARTITION_ARTIFACT_BUCKET_PREFIX: &str = "bucket-"; const PARTITION_ARTIFACT_FILE_VERSION: &str = "2.2"; const PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS: usize = 32 * 1024; +/// Top-level manifest for a precomputed partition artifact. +/// +/// The manifest is intentionally small and JSON-encoded so an external backend +/// can materialize partition data once and Lance can reopen it later without +/// understanding any backend-specific details. #[derive(Debug, Serialize, Deserialize)] struct PartitionArtifactManifest { version: u32, @@ -50,6 +55,11 @@ struct PartitionArtifactManifest { partitions: Vec, } +/// Describes where one logical IVF partition lives inside the artifact. +/// +/// Multiple logical partitions can share the same physical file when they hash +/// to the same bucket. `ranges` records the row spans within that file that +/// belong to this partition. 
#[derive(Debug, Clone, Serialize, Deserialize)] struct PartitionArtifactPartition { #[serde(default)] @@ -60,12 +70,22 @@ struct PartitionArtifactPartition { ranges: Vec, } +/// A contiguous row range for a partition inside one bucket file. +/// +/// The builder sorts each finalized bucket by partition id, so a partition is +/// usually represented by a single range. The type still allows multiple runs +/// so the reader does not depend on that implementation detail. #[derive(Debug, Clone, Serialize, Deserialize)] struct PartitionArtifactRange { offset: u64, num_rows: u64, } +/// In-memory staging buffer for one bucket before it is flushed to disk. +/// +/// Batches arrive grouped arbitrarily by the backend. The builder first +/// appends rows into per-bucket buffers so it can write larger sequential runs +/// to temporary files instead of issuing tiny file writes. #[derive(Default, Debug)] struct BucketBuffer { row_ids: Vec, @@ -74,15 +94,26 @@ struct BucketBuffer { } impl BucketBuffer { + /// Number of staged rows currently buffered for this bucket. fn len(&self) -> usize { self.row_ids.len() } + /// Whether the bucket currently has any staged rows. fn is_empty(&self) -> bool { self.row_ids.is_empty() } } +/// Writes partition-addressable encoded rows for a later Lance finalization. +/// +/// The builder uses a two-phase layout: +/// 1. Append arbitrary input batches into temporary bucket files. +/// 2. Reopen each bucket, sort rows by partition id, and rewrite one finalized +/// bucket file plus a compact manifest that records per-partition ranges. +/// +/// This keeps the write path sequential and bounded in memory while still +/// giving the finalizer efficient partition reads. pub struct PartitionArtifactBuilder { object_store: Arc, root_dir: Path, @@ -96,6 +127,11 @@ pub struct PartitionArtifactBuilder { } impl PartitionArtifactBuilder { + /// Create a builder from a URI and optional storage options. 
+ /// + /// This is the external entry point used by backends that only know an + /// artifact URI. It resolves the object store and then delegates to the + /// store-aware constructor. pub async fn try_new( uri: &str, num_partitions: usize, @@ -120,6 +156,12 @@ impl PartitionArtifactBuilder { Self::try_new_with_store(object_store, root_dir, num_partitions, pq_code_width) } + /// Create a builder against an already-resolved object store. + /// + /// The builder precomputes the temporary and final schemas and allocates + /// one staging buffer per bucket. Buckets are a write-time sharding scheme: + /// they are not visible to readers, but they keep memory usage bounded and + /// avoid one file per partition. pub fn try_new_with_store( object_store: Arc, root_dir: Path, @@ -177,6 +219,11 @@ impl PartitionArtifactBuilder { }) } + /// Append one encoded batch into the artifact staging area. + /// + /// Input batches must already contain row ids, partition ids, and PQ codes. + /// Rows are redistributed into bucket-local in-memory buffers and flushed to + /// temporary files once they become large enough. pub async fn append_batch(&mut self, batch: &RecordBatch) -> Result<()> { validate_input_batch(batch, self.pq_code_width)?; @@ -210,6 +257,11 @@ impl PartitionArtifactBuilder { Ok(()) } + /// Finalize the artifact and return the relative files that were created. + /// + /// Finalization flushes all remaining staging buffers, rewrites each bucket + /// into its final sorted form, and emits a manifest that lets Lance reopen + /// the artifact as a [`ShuffleReader`]. pub async fn finish( &mut self, metadata_file: &str, @@ -259,6 +311,12 @@ impl PartitionArtifactBuilder { Ok(files) } + /// Flush the current in-memory buffer for one bucket into its temporary + /// file. + /// + /// Temporary files preserve the original row order inside the bucket. The + /// expensive partition sort is deferred to `finalize_bucket`, so append-time + /// stays cheap. 
async fn flush_bucket(&mut self, bucket_id: usize) -> Result<()> { if self.buffers[bucket_id].is_empty() { return Ok(()); @@ -270,6 +328,8 @@ impl PartitionArtifactBuilder { Ok(()) } + /// Convert a bucket's staged vectors into a temporary batch and empty the + /// in-memory buffer. fn take_temp_batch(&mut self, bucket_id: usize) -> Result { let buffer = &mut self.buffers[bucket_id]; let row_ids = UInt64Array::from(mem::take(&mut buffer.row_ids)); @@ -284,6 +344,10 @@ impl PartitionArtifactBuilder { .map_err(Error::from) } + /// Lazily create the temporary writer for a bucket. + /// + /// Buckets that never receive rows never create a file, which keeps sparse + /// artifacts compact. async fn ensure_temp_writer(&mut self, bucket_id: usize) -> Result<&mut FileWriter> { if self.temp_writers[bucket_id].is_none() { let path = self.temp_bucket_path(bucket_id); @@ -299,6 +363,12 @@ impl PartitionArtifactBuilder { .expect("temp writer initialized")) } + /// Rewrite one temporary bucket into its final on-disk representation. + /// + /// All rows for the bucket are loaded, sorted by partition id, and written + /// to a single final bucket file that stores only the row id and PQ code. + /// The manifest is updated with the row ranges for each partition contained + /// in this bucket. async fn finalize_bucket( &self, bucket_id: usize, @@ -412,6 +482,7 @@ impl PartitionArtifactBuilder { Ok(Some(final_relative_path)) } + /// Path of the temporary file used while accumulating one bucket. fn temp_bucket_path(&self, bucket_id: usize) -> Path { self.root_dir .child(PARTITION_ARTIFACT_PARTITIONS_DIR) @@ -420,6 +491,7 @@ impl PartitionArtifactBuilder { )) } + /// Path of the finalized file for one bucket. fn final_bucket_path(&self, bucket_id: usize) -> Path { self.root_dir .child(PARTITION_ARTIFACT_PARTITIONS_DIR) @@ -428,6 +500,7 @@ impl PartitionArtifactBuilder { )) } + /// Relative path recorded in the manifest for one finalized bucket. 
fn final_bucket_relative_path(&self, bucket_id: usize) -> String { format!( "{PARTITION_ARTIFACT_PARTITIONS_DIR}/{PARTITION_ARTIFACT_BUCKET_PREFIX}{bucket_id:05}.lance" @@ -435,6 +508,11 @@ impl PartitionArtifactBuilder { } } +/// Reopens a partition artifact as a `ShuffleReader`. +/// +/// The final Lance builder consumes artifacts through the generic +/// [`ShuffleReader`] interface, so this adapter hides the manifest parsing and +/// file caching needed to expose partition-local record batch streams. #[derive(Debug)] pub(crate) struct PartitionArtifactShuffleReader { scheduler: Arc, @@ -444,6 +522,10 @@ pub(crate) struct PartitionArtifactShuffleReader { file_readers: Mutex>>, } +/// Writer options for all files stored inside a partition artifact. +/// +/// The artifact uses a fixed file version so external backends and Lance +/// finalization agree on the on-disk layout. fn file_writer_options() -> Result { Ok(FileWriterOptions { format_version: Some( @@ -460,6 +542,10 @@ fn file_writer_options() -> Result { }) } +/// Validate that a backend-produced batch matches the artifact contract. +/// +/// The builder is intentionally strict here because any schema drift would only +/// surface much later during finalization. fn validate_input_batch(batch: &RecordBatch, pq_code_width: usize) -> Result<()> { let Some(row_ids) = batch.column_by_name(ROW_ID) else { return Err(Error::invalid_input(format!( @@ -497,6 +583,7 @@ fn validate_input_batch(batch: &RecordBatch, pq_code_width: usize) -> Result<()> } } +/// Serialize a small JSON sidecar directly into the object store. async fn write_json( object_store: &ObjectStore, path: &Path, @@ -515,6 +602,7 @@ async fn write_json( } impl PartitionArtifactShuffleReader { + /// Open an artifact reader from a URI and optional storage options. 
pub(crate) async fn try_open( uri: &str, storage_options: Option<&HashMap>, @@ -537,6 +625,10 @@ impl PartitionArtifactShuffleReader { Self::try_open_with_store(object_store, root_dir).await } + /// Open an artifact reader once the object store has already been resolved. + /// + /// This reads the manifest once, validates it, and initializes the shared + /// scheduler and reader cache used by partition reads. async fn try_open_with_store(object_store: Arc, root_dir: Path) -> Result { let manifest_path = root_dir.child("manifest.json"); let manifest_bytes = object_store.read_one_all(&manifest_path).await?; @@ -574,6 +666,10 @@ impl PartitionArtifactShuffleReader { }) } + /// Open and cache a file reader for a finalized bucket file. + /// + /// Multiple logical partitions can point at the same bucket file, so the + /// reader cache prevents redundant file opens during finalization. async fn open_file_reader(&self, relative_path: &str) -> Result> { if let Some(reader) = self .file_readers @@ -606,6 +702,7 @@ impl PartitionArtifactShuffleReader { } } +/// Join a manifest-relative path onto the artifact root. fn join_relative_path(root_dir: &Path, relative_path: &str) -> Path { relative_path .split('/') @@ -615,6 +712,11 @@ fn join_relative_path(root_dir: &Path, relative_path: &str) -> Path { #[async_trait::async_trait] impl ShuffleReader for PartitionArtifactShuffleReader { + /// Return a stream over all rows belonging to one logical partition. + /// + /// The manifest already records the precise row ranges for each partition, + /// so the reader can issue targeted range reads without scanning unrelated + /// partitions. async fn read_partition( &self, partition_id: usize, @@ -659,6 +761,7 @@ impl ShuffleReader for PartitionArtifactShuffleReader { )))) } + /// Number of encoded rows available for one logical partition. 
fn partition_size(&self, partition_id: usize) -> Result { Ok(self .partitions @@ -667,6 +770,7 @@ impl ShuffleReader for PartitionArtifactShuffleReader { .unwrap_or(0)) } + /// Optional training loss propagated from the backend into the artifact. fn total_loss(&self) -> Option { self.total_loss } From 15e42fddf303aa8baaea5399cbeaefcead974b3b Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 9 Apr 2026 15:03:51 +0800 Subject: [PATCH 21/21] fix: stream partition artifact writes --- python/python/lance/cuvs.py | 5 +- python/python/lance/dataset.py | 1 + python/python/tests/test_vector_index.py | 3 + .../src/index/vector/partition_artifact.rs | 326 ++++++++---------- 4 files changed, 145 insertions(+), 190 deletions(-) diff --git a/python/python/lance/cuvs.py b/python/python/lance/cuvs.py index ba7a1a67738..6bc8dbd5312 100644 --- a/python/python/lance/cuvs.py +++ b/python/python/lance/cuvs.py @@ -6,7 +6,7 @@ import os import tempfile from importlib import import_module -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: from pathlib import Path @@ -34,6 +34,7 @@ def build_vector_index_on_cuvs( num_partitions: int, num_sub_vectors: int, dst_dataset_uri: str | Path | None = None, + storage_options: Optional[dict[str, str]] = None, *, sample_rate: int = 256, max_iters: int = 50, @@ -60,6 +61,7 @@ def build_vector_index_on_cuvs( max_iters=max_iters, num_bits=num_bits, filter_nan=filter_nan, + storage_options=storage_options, ) artifact = backend.build_ivf_pq_artifact( dataset.uri, @@ -68,6 +70,7 @@ def build_vector_index_on_cuvs( artifact_uri=artifact_uri, batch_size=batch_size, filter_nan=filter_nan, + storage_options=storage_options, ) return ( artifact.artifact_uri, diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 316f18d642f..c786b8f7cce 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2987,6 +2987,7 @@ def _create_index_impl( accelerator, num_partitions, 
num_sub_vectors, + storage_options=storage_options, sample_rate=kwargs.get("sample_rate", 256), max_iters=kwargs.get("max_iters", 50), num_bits=kwargs.get("num_bits", 8), diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 486e670f123..f6470a5bd8e 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -682,6 +682,7 @@ def build_ivf_pq_artifact(self, dataset_uri, column, **kwargs): 4, 16, dst_dataset_uri=tmp_path / "artifact_root", + storage_options={"region": "us-east-1"}, sample_rate=7, max_iters=20, num_bits=4, @@ -700,6 +701,7 @@ def build_ivf_pq_artifact(self, dataset_uri, column, **kwargs): "max_iters": 20, "num_bits": 4, "filter_nan": False, + "storage_options": {"region": "us-east-1"}, } assert calls["build"]["dataset_uri"] == ds.uri assert calls["build"]["column"] == "vector" @@ -707,6 +709,7 @@ def build_ivf_pq_artifact(self, dataset_uri, column, **kwargs): assert calls["build"]["artifact_uri"] == str(tmp_path / "artifact_root") assert calls["build"]["batch_size"] == 4096 assert calls["build"]["filter_nan"] is False + assert calls["build"]["storage_options"] == {"region": "us-east-1"} assert artifact_uri == str(tmp_path / "artifact") assert files == [str(tmp_path / "artifact" / "data.lance")] assert ivf_centroids.equals(training.ivf_centroids()) diff --git a/rust/lance/src/index/vector/partition_artifact.rs b/rust/lance/src/index/vector/partition_artifact.rs index fe585f513be..a721437358d 100644 --- a/rust/lance/src/index/vector/partition_artifact.rs +++ b/rust/lance/src/index/vector/partition_artifact.rs @@ -7,9 +7,8 @@ use std::ops::Range; use std::sync::{Arc, Mutex}; use arrow_array::cast::AsArray; -use arrow_array::{FixedSizeListArray, RecordBatch, UInt8Array, UInt32Array, UInt64Array}; +use arrow_array::{FixedSizeListArray, RecordBatch, UInt8Array, UInt64Array}; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; -use futures::TryStreamExt; use 
lance_arrow::FixedSizeListArrayExt; use lance_core::cache::LanceCache; use lance_core::datatypes::Schema; @@ -34,7 +33,6 @@ const PARTITION_ARTIFACT_MANIFEST_VERSION: u32 = 1; const PARTITION_ARTIFACT_MANIFEST_FILE_NAME: &str = "manifest.json"; const PARTITION_ARTIFACT_PARTITIONS_DIR: &str = "partitions"; const PARTITION_ARTIFACT_DEFAULT_BUCKETS: usize = 256; -const PARTITION_ARTIFACT_STAGING_PREFIX: &str = ".staging-bucket-"; const PARTITION_ARTIFACT_BUCKET_PREFIX: &str = "bucket-"; const PARTITION_ARTIFACT_FILE_VERSION: &str = "2.2"; const PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS: usize = 32 * 1024; @@ -107,23 +105,22 @@ impl BucketBuffer { /// Writes partition-addressable encoded rows for a later Lance finalization. /// -/// The builder uses a two-phase layout: -/// 1. Append arbitrary input batches into temporary bucket files. -/// 2. Reopen each bucket, sort rows by partition id, and rewrite one finalized -/// bucket file plus a compact manifest that records per-partition ranges. -/// -/// This keeps the write path sequential and bounded in memory while still -/// giving the finalizer efficient partition reads. +/// The builder uses bucket-local buffering to keep append-time memory bounded. +/// Each flush sorts only the current in-memory bucket and appends it directly to +/// the finalized bucket file, while the manifest accumulates per-partition row +/// ranges. This keeps the writer streaming and avoids a full read/sort/rewrite +/// pass at `finish()` time. pub struct PartitionArtifactBuilder { object_store: Arc, root_dir: Path, num_partitions: usize, num_buckets: usize, pq_code_width: usize, - temp_schema: Arc, final_schema: Arc, - temp_writers: Vec>, + final_writers: Vec>, buffers: Vec, + partitions: Vec, + bucket_row_counts: Vec, } impl PartitionArtifactBuilder { @@ -158,10 +155,10 @@ impl PartitionArtifactBuilder { /// Create a builder against an already-resolved object store. 
/// - /// The builder precomputes the temporary and final schemas and allocates - /// one staging buffer per bucket. Buckets are a write-time sharding scheme: - /// they are not visible to readers, but they keep memory usage bounded and - /// avoid one file per partition. + /// The builder precomputes the final schema and allocates one staging + /// buffer per bucket. Buckets are a write-time sharding scheme: they are + /// not visible to readers, but they keep memory usage bounded and avoid one + /// file per partition. pub fn try_new_with_store( object_store: Arc, root_dir: Path, @@ -182,18 +179,6 @@ impl PartitionArtifactBuilder { let num_buckets = num_partitions .min(PARTITION_ARTIFACT_DEFAULT_BUCKETS) .max(1); - let temp_schema = Arc::new(ArrowSchema::new(vec![ - Field::new(ROW_ID, DataType::UInt64, false), - Field::new(PART_ID_COLUMN, DataType::UInt32, false), - Field::new( - PQ_CODE_COLUMN, - DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::UInt8, true)), - pq_code_width as i32, - ), - true, - ), - ])); let final_schema = Arc::new(ArrowSchema::new(vec![ Field::new(ROW_ID, DataType::UInt64, false), Field::new( @@ -212,10 +197,18 @@ impl PartitionArtifactBuilder { num_partitions, num_buckets, pq_code_width, - temp_schema, final_schema, - temp_writers: (0..num_buckets).map(|_| None).collect(), + final_writers: (0..num_buckets).map(|_| None).collect(), buffers: (0..num_buckets).map(|_| BucketBuffer::default()).collect(), + partitions: vec![ + PartitionArtifactPartition { + path: None, + num_rows: 0, + ranges: Vec::new(), + }; + num_partitions + ], + bucket_row_counts: vec![0; num_buckets], }) } @@ -259,9 +252,9 @@ impl PartitionArtifactBuilder { /// Finalize the artifact and return the relative files that were created. /// - /// Finalization flushes all remaining staging buffers, rewrites each bucket - /// into its final sorted form, and emits a manifest that lets Lance reopen - /// the artifact as a [`ShuffleReader`]. 
+ /// Finalization only needs to flush the remaining in-memory buffers and + /// persist the manifest because bucket files are already in their final + /// layout. pub async fn finish( &mut self, metadata_file: &str, @@ -270,25 +263,16 @@ impl PartitionArtifactBuilder { for bucket_id in 0..self.num_buckets { self.flush_bucket(bucket_id).await?; } - for writer in self.temp_writers.iter_mut() { + for writer in self.final_writers.iter_mut() { if let Some(writer) = writer.as_mut() { writer.finish().await?; } } - let mut partitions = vec![ - PartitionArtifactPartition { - path: None, - num_rows: 0, - ranges: Vec::new(), - }; - self.num_partitions - ]; let mut artifact_files = Vec::with_capacity(self.num_buckets + 1); - for bucket_id in 0..self.num_buckets { - if let Some(relative_path) = self.finalize_bucket(bucket_id, &mut partitions).await? { - artifact_files.push(relative_path); + if self.final_writers[bucket_id].is_some() { + artifact_files.push(self.final_bucket_relative_path(bucket_id)); } } @@ -297,7 +281,7 @@ impl PartitionArtifactBuilder { num_partitions: self.num_partitions, metadata_file: Some(metadata_file.to_string()), total_loss, - partitions, + partitions: self.partitions.clone(), }; write_json( self.object_store.as_ref(), @@ -311,153 +295,39 @@ impl PartitionArtifactBuilder { Ok(files) } - /// Flush the current in-memory buffer for one bucket into its temporary - /// file. + /// Flush the current in-memory buffer for one bucket into its finalized + /// bucket file. /// - /// Temporary files preserve the original row order inside the bucket. The - /// expensive partition sort is deferred to `finalize_bucket`, so append-time - /// stays cheap. + /// Each flush sorts only the buffered rows for this bucket and appends them + /// to the final file while recording new manifest ranges for the affected + /// partitions. 
async fn flush_bucket(&mut self, bucket_id: usize) -> Result<()> { if self.buffers[bucket_id].is_empty() { return Ok(()); } - let batch = self.take_temp_batch(bucket_id)?; - let writer = self.ensure_temp_writer(bucket_id).await?; - writer.write_batch(&batch).await?; - Ok(()) - } - - /// Convert a bucket's staged vectors into a temporary batch and empty the - /// in-memory buffer. - fn take_temp_batch(&mut self, bucket_id: usize) -> Result { let buffer = &mut self.buffers[bucket_id]; let row_ids = UInt64Array::from(mem::take(&mut buffer.row_ids)); - let part_ids = UInt32Array::from(mem::take(&mut buffer.partition_ids)); + let part_ids = mem::take(&mut buffer.partition_ids); let pq_values = UInt8Array::from(mem::take(&mut buffer.pq_values)); - let pq_codes = - FixedSizeListArray::try_new_from_values(pq_values, self.pq_code_width as i32)?; - RecordBatch::try_new( - self.temp_schema.clone(), - vec![Arc::new(row_ids), Arc::new(part_ids), Arc::new(pq_codes)], - ) - .map_err(Error::from) - } - - /// Lazily create the temporary writer for a bucket. - /// - /// Buckets that never receive rows never create a file, which keeps sparse - /// artifacts compact. - async fn ensure_temp_writer(&mut self, bucket_id: usize) -> Result<&mut FileWriter> { - if self.temp_writers[bucket_id].is_none() { - let path = self.temp_bucket_path(bucket_id); - let writer = FileWriter::try_new( - self.object_store.create(&path).await?, - Schema::try_from(self.temp_schema.as_ref())?, - file_writer_options()?, - )?; - self.temp_writers[bucket_id] = Some(writer); - } - Ok(self.temp_writers[bucket_id] - .as_mut() - .expect("temp writer initialized")) - } - - /// Rewrite one temporary bucket into its final on-disk representation. - /// - /// All rows for the bucket are loaded, sorted by partition id, and written - /// to a single final bucket file that stores only the row id and PQ code. - /// The manifest is updated with the row ranges for each partition contained - /// in this bucket. 
- async fn finalize_bucket( - &self, - bucket_id: usize, - partitions: &mut [PartitionArtifactPartition], - ) -> Result> { - let temp_path = self.temp_bucket_path(bucket_id); - if !self.object_store.exists(&temp_path).await? { - return Ok(None); - } - - let reader = FileReader::try_open( - ScanScheduler::new( - self.object_store.clone(), - SchedulerConfig::max_bandwidth(&self.object_store), - ) - .open_file(&temp_path, &CachedFileSize::unknown()) - .await?, - None, - Arc::::default(), - &LanceCache::no_cache(), - FileReaderOptions::default(), - ) - .await?; - - let batches = reader - .read_stream( - ReadBatchParams::RangeFull, - u32::MAX, - 16, - FilterExpression::no_filter(), - )? - .try_collect::>() - .await?; - let total_rows = batches.iter().map(|batch| batch.num_rows()).sum::(); - if total_rows == 0 { - self.object_store.delete(&temp_path).await?; - return Ok(None); - } - - let mut row_ids = Vec::with_capacity(total_rows); - let mut partition_ids = Vec::with_capacity(total_rows); - let mut pq_values = Vec::with_capacity(total_rows * self.pq_code_width); - for batch in batches { - let batch_row_ids = batch[ROW_ID].as_primitive::(); - let batch_partition_ids = - batch[PART_ID_COLUMN].as_primitive::(); - let batch_pq = batch[PQ_CODE_COLUMN].as_fixed_size_list(); - let batch_pq_values = batch_pq - .values() - .as_primitive::(); - row_ids.extend(batch_row_ids.values().iter().copied()); - partition_ids.extend(batch_partition_ids.values().iter().copied()); - pq_values.extend_from_slice(batch_pq_values.values().as_ref()); - } + let total_rows = row_ids.len(); let mut permutation = (0..total_rows).collect::>(); - permutation.sort_unstable_by_key(|&idx| partition_ids[idx]); + permutation.sort_unstable_by_key(|&idx| part_ids[idx]); let mut sorted_row_ids = Vec::with_capacity(total_rows); let mut sorted_partition_ids = Vec::with_capacity(total_rows); let mut sorted_pq_values = Vec::with_capacity(total_rows * self.pq_code_width); for idx in permutation { - 
sorted_row_ids.push(row_ids[idx]); - sorted_partition_ids.push(partition_ids[idx]); + sorted_row_ids.push(row_ids.value(idx)); + sorted_partition_ids.push(part_ids[idx]); let start = idx * self.pq_code_width; let end = start + self.pq_code_width; - sorted_pq_values.extend_from_slice(&pq_values[start..end]); + sorted_pq_values.extend_from_slice(&pq_values.values()[start..end]); } - let final_path = self.final_bucket_path(bucket_id); + let file_offset = self.bucket_row_counts[bucket_id]; let final_relative_path = self.final_bucket_relative_path(bucket_id); - let mut writer = FileWriter::try_new( - self.object_store.create(&final_path).await?, - Schema::try_from(self.final_schema.as_ref())?, - file_writer_options()?, - )?; - let final_batch = RecordBatch::try_new( - self.final_schema.clone(), - vec![ - Arc::new(UInt64Array::from(sorted_row_ids)), - Arc::new(FixedSizeListArray::try_new_from_values( - UInt8Array::from(sorted_pq_values), - self.pq_code_width as i32, - )?), - ], - )?; - writer.write_batch(&final_batch).await?; - writer.finish().await?; - let mut offset = 0usize; while offset < sorted_partition_ids.len() { let partition_id = sorted_partition_ids[offset] as usize; @@ -467,28 +337,59 @@ impl PartitionArtifactBuilder { { end += 1; } - partitions[partition_id] = PartitionArtifactPartition { - path: Some(final_relative_path.clone()), - num_rows: end - offset, - ranges: vec![PartitionArtifactRange { - offset: offset as u64, - num_rows: (end - offset) as u64, - }], - }; + let partition = &mut self.partitions[partition_id]; + match &partition.path { + Some(existing) if existing != &final_relative_path => { + return Err(Error::io(format!( + "partition {} is split across multiple bucket files: '{}' vs '{}'", + partition_id, existing, final_relative_path + ))); + } + None => partition.path = Some(final_relative_path.clone()), + _ => {} + } + partition.num_rows += end - offset; + partition.ranges.push(PartitionArtifactRange { + offset: file_offset + offset as u64, + 
num_rows: (end - offset) as u64, + }); offset = end; } - self.object_store.delete(&temp_path).await?; - Ok(Some(final_relative_path)) + let pq_codes = FixedSizeListArray::try_new_from_values( + UInt8Array::from(sorted_pq_values), + self.pq_code_width as i32, + )?; + let final_batch = RecordBatch::try_new( + self.final_schema.clone(), + vec![ + Arc::new(UInt64Array::from(sorted_row_ids)), + Arc::new(pq_codes), + ], + )?; + let writer = self.ensure_final_writer(bucket_id).await?; + writer.write_batch(&final_batch).await?; + self.bucket_row_counts[bucket_id] += total_rows as u64; + Ok(()) } - /// Path of the temporary file used while accumulating one bucket. - fn temp_bucket_path(&self, bucket_id: usize) -> Path { - self.root_dir - .child(PARTITION_ARTIFACT_PARTITIONS_DIR) - .child(format!( - "{PARTITION_ARTIFACT_STAGING_PREFIX}{bucket_id:05}.lance" - )) + /// Lazily create the finalized writer for a bucket. + /// + /// Buckets that never receive rows never create a file, which keeps sparse + /// artifacts compact. + async fn ensure_final_writer(&mut self, bucket_id: usize) -> Result<&mut FileWriter> { + if self.final_writers[bucket_id].is_none() { + let path = self.final_bucket_path(bucket_id); + let writer = FileWriter::try_new( + self.object_store.create(&path).await?, + Schema::try_from(self.final_schema.as_ref())?, + file_writer_options()?, + )?; + self.final_writers[bucket_id] = Some(writer); + } + Ok(self.final_writers[bucket_id] + .as_mut() + .expect("final writer initialized")) } /// Path of the finalized file for one bucket. @@ -1057,4 +958,51 @@ mod tests { .unwrap_err(); assert!(matches!(error, Error::InvalidInput { .. 
})); } + + #[tokio::test] + async fn partition_artifact_builder_records_multiple_ranges_for_repeated_flushes() { + let tempdir = tempfile::tempdir().unwrap(); + let root_dir = tempdir.path().join("artifact"); + fs::create_dir_all(&root_dir).unwrap(); + let object_store = Arc::new(ObjectStore::local()); + let root_path = Path::from_filesystem_path(&root_dir).unwrap(); + + let mut builder = + PartitionArtifactBuilder::try_new_with_store(object_store, root_path, 4, 2).unwrap(); + let num_rows = PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS + 1024; + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new(ROW_ID, DataType::UInt64, false), + Field::new(PART_ID_COLUMN, DataType::UInt32, false), + Field::new( + PQ_CODE_COLUMN, + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::UInt8, true)), 2), + true, + ), + ])); + let row_ids = UInt64Array::from_iter_values((0..num_rows as u64).into_iter()); + let part_ids = UInt32Array::from_iter_values((0..num_rows).map(|_| 0_u32)); + let pq_values = UInt8Array::from_iter_values((0..num_rows * 2).map(|v| (v % 251) as u8)); + let pq_codes = FixedSizeListArray::try_new_from_values(pq_values, 2).unwrap(); + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(row_ids), Arc::new(part_ids), Arc::new(pq_codes)], + ) + .unwrap(); + + builder.append_batch(&batch).await.unwrap(); + builder.finish("metadata.lance", None).await.unwrap(); + + let manifest: PartitionArtifactManifest = + serde_json::from_slice(&fs::read(root_dir.join("manifest.json")).unwrap()).unwrap(); + assert_eq!(manifest.partitions[0].num_rows, num_rows); + assert_eq!(manifest.partitions[0].ranges.len(), 2); + assert_eq!( + manifest.partitions[0].ranges[0].num_rows, + PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS as u64 + ); + assert_eq!( + manifest.partitions[0].ranges[1].offset, + PARTITION_ARTIFACT_BUCKET_BUFFER_ROWS as u64 + ); + } }