diff --git a/.github/workflows/run-bench.yml b/.github/workflows/run-bench.yml
index 19762aaa5..6919b0e56 100644
--- a/.github/workflows/run-bench.yml
+++ b/.github/workflows/run-bench.yml
@@ -126,6 +126,20 @@ jobs:
           ref: ${{ matrix.branch }}
           fetch-depth: 0
 
+      # Decode and write the protected dataset catalog. The secret is passed via
+      # env rather than interpolated into the script, so its text is never parsed as shell.
+      # TO UPDATE THIS SECRET:
+      # 1. On your local machine, run:
+      #    base64 -i jvector-examples/yaml-configs/dataset-catalogs/protected-catalog.yaml
+      # 2. Go to GitHub Repo -> Settings -> Secrets and variables -> Actions
+      # 3. Update the PROTECTED_CATALOG_YAML secret with the new Base64 string.
+      - name: Inject Protected Catalog
+        env:
+          PROTECTED_CATALOG_YAML: ${{ secrets.PROTECTED_CATALOG_YAML }}
+        run: |
+          mkdir -p jvector-examples/yaml-configs/dataset-catalogs
+          printf '%s\n' "$PROTECTED_CATALOG_YAML" | base64 -d > jvector-examples/yaml-configs/dataset-catalogs/protected-catalog.yaml
+
       # Create a directory to store benchmark results
       - name: Create results directory
         run: mkdir -p benchmark_results
@@ -137,8 +151,6 @@ jobs:
       # Run the benchmark if jvector-examples exists
       - name: Run benchmark
         id: run-benchmark
-        env:
-          DATASET_HASH: ${{ secrets.DATASETS_KEYPATH }}
         run: |
           # Check if jvector-examples directory and AutoBenchYAML class exist
           if [ ! -d "jvector-examples" ]; then
diff --git a/.gitignore b/.gitignore
index ea443370d..4b5599f84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,10 +3,19 @@ local/
 .mvn/wrapper/maven-wrapper.jar
 .java-version
 .bob/
+dataset_
+**/local_datasets/**
 
 ### Bench caches
 pq_cache/
 index_cache/
+dataset_cache/
+
+### Data catalogs
+jvector-examples/yaml-configs/dataset-catalogs/*.yaml
+jvector-examples/yaml-configs/dataset-catalogs/*.yml
+!jvector-examples/yaml-configs/dataset-catalogs/public-catalog.yaml
+jvector-examples/yaml-configs/dataset-catalogs/.catalog-cache/
 
 ### Logging (or whatever you use)
 logging/
@@ -49,3 +58,5 @@ hdf5/
 # JMH generated files
 dependency-reduced-pom.xml
 results.csv
+**/datasets/custom/**
+**/dataset_cache/**
diff --git a/README.md b/README.md
index cb9843336..10e9eb738 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,7 @@ You may also use method-level filtering and patterns, e.g.,
 (The `failIfNoSpecifiedTests` option works around a quirk of surefire: it is happy to run `test` with submodules with empty test sets,
 but as soon as you supply a filter, it wants at least one match in every submodule.)
 
-You can run `SiftSmall` and `Bench` directly to get an idea of what all is going on here. `Bench` will automatically download required datasets to the `fvec` and `hdf5` directories.
+You can run `SiftSmall` and `Bench` directly to get an idea of what all is going on here. `Bench` will automatically download required datasets to the `dataset_cache` directory.
 The files used by `SiftSmall` can be found in the [siftsmall directory](./siftsmall) in the project root.
 
 To run either class, you can use the Maven exec-plugin via the following incantations:
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index e27113dc1..79ddba82e 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -4,21 +4,19 @@ JVector comes with a built-in benchmarking system in `jvector-examples/.../Bench
 
 To run a benchmark
 - Decide which dataset(s) you want to benchmark. A dataset consists of
-    - The vectors to be indexed, usually called the "base" or "target" vectors.
-    - The query vectors.
-    - The "ground truth" results which are used to compute accuracy metrics.
-    - The similarity metric which should have been used to compute the ground truth (dot product, cosine similarity or L2 distance).
-- Configure the parameters combinations for which you want to run the benchmark. This includes graph index parameters, quantization parameters and search parameters.
+    - The vectors to be indexed, usually called the "base" or "target" vectors
+    - The query vectors
+    - The "ground truth" results that are used to compute accuracy metrics
+    - The similarity metric used to compute the ground truth (dot product, cosine similarity or L2 distance)
+- Configure the parameter combinations for which you want to run the benchmark. This includes index construction parameters, quantization parameters and search parameters.
 
-JVector supports two types of datasets:
-- **Fvec/Ivec**: The dataset consists of three files, for example `base.fvec`, `queries.fvec` and `neighbors.ivec` containing the base vectors, query vectors, and ground truth. (`fvec` and `ivec` file formats are described [here](http://corpus-texmex.irisa.fr/))
-- **HDF5**: The dataset consists of a single HDF5 file with three datasets labelled `train`, `test` and `neighbors`, representing the base vectors, query vectors and the ground truth.
+JVector supports datasets in the fvecs/ivecs format.  These consist of three files, for example `base.fvecs`, `queries.fvecs` and `neighbors.ivecs` containing the base vectors, query vectors, and ground truth. (`fvecs` and `ivecs` file formats are described [here](http://corpus-texmex.irisa.fr/))
 
 The general procedure for running benchmarks is mentioned below. The following sections describe the process in more detail.
 - [Specify the dataset](#specifying-datasets) names to benchmark in `datasets.yml`.
 - Certain datasets will be downloaded automatically. If using a different dataset, make sure the dataset files are downloaded and made available (refer the section on [Custom datasets](#custom-datasets)).
-- Adjust the benchmark parameters in `default.yml`. This will affect the parameters for all datasets to be benchmarked. You can specify custom parameters for a specific dataset by creating a file called `<your-dataset-name>.yml` in the same folder.
-- Decide on the kind of measurements and logging you want and configure them in `run.yml`.
+- Adjust the benchmark parameters in `default.yml`. This will affect the parameters for all datasets benchmarked. You can specify custom parameters for a specific dataset by creating a file called `<your-dataset-name>.yml` in the `index-parameters` subfolder.
+- Decide on the kind of measurements and logging you want and configure them in `run-config.yml`.
 
 You can run the configured benchmark with maven:
 ```sh
@@ -31,31 +29,28 @@ The datasets you want to benchmark should be specified in `jvector-examples/yaml
 
 To benchmark a single dataset, comment out the entries corresponding to all other datasets. (Or provide command line arguments as described in [Running `bench` from the command line](#running-bench-from-the-command-line))
 
-Datasets are assumed to be Fvec/Ivec based unless the entry in the `datasets.yml` ends with `.hdf5`. In this case, `.hdf5` is not considered part of the "dataset name" referenced in other sections.
+Datasets are grouped into categories. The categories can be arbitrarily chosen for convenience and are not currently considered by the benchmarking system.
 
-You'll notice that datasets are grouped into categories. The categories can be arbitrarily chosen for convenience and are not currently considered by the benchmarking system.
-
-For HDF5 files, the substrings `-angular`, `-euclidean` and `-dot` correspond to cosine similarity, L2 distance, and dot product similarity functions (these substrings ARE considered to be part of the "dataset name"). Currently, Fvec/Ivec datasets are implicitly assumed to use cosine similarity (changing this requires editing `DataSetLoaderMFD.java`).
+Dataset similarity functions are configured in `jvector-examples/yaml-configs/dataset-metadata.yml`.
 
 Example `datasets.yml`:
 
 ```yaml
 category0:
-  - my-fvec-dataset                      # fvec/ivec dataset, cosine similarity
-  - my-hdf5-dataset-angular.hdf5         # hdf5 dataset, cosine similarity
+  - my-dataset-a
+  - my-dataset-b
 some-other-category:
-  - a-huge-dataset-1024d-euclidean.hdf5  # hdf5 dataset, L2 similarity
-  - my-simple-dataset-dot.hdf5           # hdf5 dataset, dot product similarity
-  - some-dataset-euclidean               # fvec/ivec dataset, cosine similarity (NOT L2 unless you change the code!)
+  - another-dataset-a
+  - another-dataset-b
 ```
 
 ## Setting benchmark parameters
 
 ### default.yml / \<dataset-name\>.yml
 
-`jvector-examples/yaml-configs/default.yml` specifies the default index construction and search parameters to be used by `bench` for all datasets.
+`jvector-examples/yaml-configs/index-parameters/default.yml` specifies the default index construction and search parameters to be used by `bench` for all datasets.
 
-You can specify a custom set of a parameters for any given dataset by creating a file called `<dataset-name>.yml`, with `<dataset-name>` replaced by the actual name of the dataset. This is the same as the identifier used in `datasets.yml`, but without the `.hdf5` suffix for hdf5 datasets. The format of this file is exactly the same as `default.yml`.
+You can specify a custom set of parameters for any given dataset by creating a file called `<dataset-name>.yml`, with `<dataset-name>` replaced by the actual name of the dataset. This is the same as the identifier used in `datasets.yml`. The format of this file is exactly the same as `default.yml`.
 
 Refer to `default.yml` for a list of all options.
 
@@ -67,7 +62,7 @@ construction:
 ```
 will build and benchmark four graphs, one for each combination of M and ef in {(32, 100), (64, 100), (32, 200), (64, 200)}. This is particularly useful when running a Grid search to identify the best performing parameters.
 
-### run.yml
+### run-config.yml
 
 This file contains configurations for
 - Specifying the measurements you want to report, like QPS, latency and recall
@@ -75,7 +70,7 @@ This file contains configurations for
 
 The configurations in this file are "run-level", meaning that they are shared across all the datasets being benchmarked.
 
-See `run.yml` for a full list of all options.
+See `run-config.yml` for a full list of all options.
 
 ## Running `bench` from the command line
 
@@ -86,45 +81,37 @@ mvn compile exec:exec@bench -pl jvector-examples -am
 
 To benchmark a subset of the datasets in `datasets.yml`, you can provide a space-separated list of regexes as arguments.
 ```sh
-# matches `glove-25-angular.hdf5`, `glove-50-angular.hdf5`, `nytimes-256-angular.hdf5` etc
+# matches `glove-25-angular`, `glove-50-angular`, `nytimes-256-angular` etc
 mvn compile exec:exec@bench -pl jvector-examples -am -DbenchArgs="glove nytimes"
 ```
 
 ## Custom Datasets
 
-### Custom Fvec/Ivec datasets
-
-Using fvec/ivec datasets requires them to be configured in `DataSetLoaderMFD.java`. Some datasets are already pre-configured; these will be downloaded and used automatically on running the benchmark.
-
-To use a custom dataset consisting of files `base.fvec`, `queries.fvec` and `neighbors.ivec`, do the following:
-- Ensure that you have three files:
-    - `base.fvec` containing N D-dimensional float vectors. These are used to build the index.
-    - `queries.fvec` containing Q D-dimensional float vectors. These are used for querying the built index.
-    - `neighbors.ivec` containing Q K-dimensional integer vectors, one for each query vector, representing the exact K-nearest neighbors for that query among the base vectors.
-    The files can be named however you like.
-- Save all three files somewhere in the `fvec` directory in the root of the `jvector` repo (if it doesn't exist, create it). It's recommended to create at least one sub-folder with the name of the dataset and copy or move all three files there.
-- Edit `DataSetLoaderMFD.java` to configure a new dataset and it's associated files:
-    ```java
-    put("cust-ds", new MultiFileDatasource("cust-ds",
-            "cust-ds/base.fvec",
-            "cust-ds/query.fvec",
-            "cust-ds/neighbors.ivec"));
+Datasets are configured via YAML catalog files under `jvector-examples/yaml-configs/dataset-catalogs/`. The loader recursively discovers all `.yaml`/`.yml` files in that directory tree. See `jvector-examples/yaml-configs/dataset-catalogs/local-catalog.yaml` for the full format reference.
+
+To add a custom fvecs/ivecs dataset:
+
+1. Add a `.yaml` file to the YAML catalog directory, mapping your dataset name to its files:
+    ```yaml
+    _defaults:
+      cache_dir: ${DATASET_CACHE_DIR:-dataset_cache}
+
+    my-dataset:
+      base: my_base_vectors.fvecs
+      query: my_query_vectors.fvecs
+      gt: my_ground_truth.ivecs
+    ```
+2. Place your fvecs/ivecs files at the paths you specified in the YAML (or specify a `cache_dir` / `base_url` to fetch them from a remote source).
+3. Add the dataset's similarity function to `jvector-examples/yaml-configs/dataset-metadata.yml`:
+    ```yaml
+    my-dataset:
+      similarity_function: COSINE
+      load_behavior: NO_SCRUB
     ```
-    The file paths are resolved relative to the `fvec` directory. `cust-ds` is the name of the dataset and can be changed to whatever is appropriate.
-- In `jvector-examples/yaml-configs/datasets.yml`, add an entry corresponding to your custom dataset. Comment out other datasets which you do not want to benchmark.
+4. Add the dataset name to `jvector-examples/yaml-configs/datasets.yml` so BenchYAML can find it:
     ```yaml
     custom:
-      - cust-ds
+      - my-dataset
     ```
 
-## Custom HDF5 datasets
-
-HDF5 datasets consist of a single file. The Hdf5Loader looks for three HDF5 datasets within the file, `train`, `test` and `neighbors`. These correspond to the base, query and neighbors vectors described above for fvec/ivec files.
-
-To use an HDF5 dataset, edit `jvector-examples/yaml-configs/datasets.yml` to add an entry like the following:
-```yaml
-category:
-  - <dataset-name>.hdf5
-```
-
-BenchYAML looks for hdf5 datasets with the name `<dataset-name>.hdf5` in the `hdf5` folder in the root of this repo. If the file doesn't exist, BenchYAML will attempt to automatically download the dataset from ann-benchmarks.com. If your dataset is not from ann-benchmarks.com, simply ensure that the dataset is available in the `hdf5` folder and edit `datasets.yml` accordingly.
+For remote datasets, use `base_url` to specify where files should be downloaded from. The `${VAR}` and `${VAR:-default}` syntax is supported for environment variable expansion. See the example config for details.
diff --git a/jvector-examples/README.md b/jvector-examples/README.md
index 27c09b5d4..c782f7db8 100644
--- a/jvector-examples/README.md
+++ b/jvector-examples/README.md
@@ -11,8 +11,8 @@ A simple benchmark for the sift dataset located in the [siftsmall](./siftsmall)
 Performs grid search across the `GraphIndexBuilder` parameter space to find
 the best tradeoffs between recall and throughput.  
 
-This benchmark requires datasets from [https://github.com/erikbern/ann-benchmarks](https://github.com/erikbern/ann-benchmarks/blob/main/README.md#data-sets) to be downloaded to hdf5 and fvec 
-directories `hdf5` or `fvec` under the project root depending on the dataset format. 
+This benchmark requires `fvecs` versions of datasets from [https://github.com/erikbern/ann-benchmarks](https://github.com/erikbern/ann-benchmarks/blob/main/README.md#data-sets) to be downloaded to the `dataset_cache`
+directory under the project root.
 
 You can use [`plot_output.py`](./plot_output.py) to graph the [pareto-optimal points](https://en.wikipedia.org/wiki/Pareto_efficiency) found by `Bench`.
 
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java
index 343fcbd95..e066a34dc 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java
@@ -94,11 +94,11 @@ public static void main(String[] args) throws IOException {
             RunConfig runCfg = RunConfig.loadDefault();
             artifacts = RunArtifacts.open(runCfg, allConfigs);
         } catch (java.io.FileNotFoundException e) {
-            // Legacy yamlSchemaVersion "0" behavior: no run.yml
+            // Legacy yamlSchemaVersion "0" behavior: no run-config.yml
             // - logging disabled
             // - console shows compute selection
             // - compute selection comes from legacy search.benchmarks if present, else default
-            System.err.println("WARNING: run.yml not found. Falling back to deprecated legacy behavior: "
+            System.err.println("WARNING: run-config.yml not found. Falling back to deprecated legacy behavior: "
                     + "no logging, console mirrors computed benchmarks.");
 
             Map<String, List<String>> legacyBenchmarks = null;
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java
index 032ea2f6c..ea4752e4b 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java
@@ -16,7 +16,7 @@
 
 package io.github.jbellis.jvector.example;
 
-import io.github.jbellis.jvector.example.benchmarks.datasets.DataSetLoaderMFD;
+import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets;
 import io.github.jbellis.jvector.example.reporting.RunArtifacts;
 import io.github.jbellis.jvector.example.yaml.MultiConfig;
 import io.github.jbellis.jvector.example.yaml.RunConfig;
@@ -36,9 +36,8 @@ public static void main(String[] args) throws IOException {
         // Run-level policy config (benchmarks/console/logging + run metadata)
         RunConfig runCfg = RunConfig.loadDefault();
 
-        // Load dataset
-        var ds = new DataSetLoaderMFD().loadDataSet(datasetName)
-                .orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"))
+        var ds = DataSets.loadDataSet(datasetName).orElseThrow(
+                () -> new RuntimeException("dataset " + datasetName + " not found"))
                 .getDataSet();
 
         // Run artifacts + selections (sys_info/dataset_info/experiments.csv)
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java
deleted file mode 100644
index 3c218c85f..000000000
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright DataStax, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.github.jbellis.jvector.example.benchmarks.datasets;
-
-import io.github.jbellis.jvector.vector.VectorizationProvider;
-import io.github.jbellis.jvector.vector.types.VectorFloat;
-import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
-import io.jhdf.HdfFile;
-import io.jhdf.api.Dataset;
-import io.jhdf.object.datatype.FloatingPoint;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.HttpURLConnection;
-import java.net.URL;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.Optional;
-import java.util.stream.IntStream;
-
-/**
- * This dataset loader will get and load hdf5 files from <a href="https://ann-benchmarks.com/">ann-benchmarks</a>.
- *
- * <p>For curated benchmark datasets, properties are provided by
- * {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. If the metadata
- * does not provide a similarity function, an error is thrown.
- */
-public class DataSetLoaderHDF5 implements DataSetLoader {
-    public static final Path HDF5_DIR = Path.of("hdf5");
-    private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
-    public static final String HDF5_EXTN = ".hdf5";
-    private static final DataSetMetadataReader metadata = DataSetMetadataReader.load();
-
-    /**
-     * {@inheritDoc}
-     */
-    public Optional<DataSetInfo> loadDataSet(String datasetName) {
-        return maybeDownloadHdf5(datasetName).map(path -> {
-            var props = getProperties(datasetName);
-            props.similarityFunction()
-                    .orElseThrow(() -> new IllegalArgumentException(
-                            "No similarity function configured in dataset_metadata.yml for HDF5 dataset: " + datasetName));
-            return new DataSetInfo(props, () -> readHdf5Data(path, props));
-        });
-    }
-
-    /// Reads base vectors, query vectors, and ground truth from an HDF5 file
-    /// and returns a {@link DataSet} using the configured dataset properties.
-    private DataSet readHdf5Data(Path path, DataSetProperties props) {
-        VectorFloat<?>[] baseVectors;
-        VectorFloat<?>[] queryVectors;
-        var gtSets = new ArrayList<List<Integer>>();
-        try (HdfFile hdf = new HdfFile(path)) {
-            var baseVectorsArray =
-                    (float[][]) hdf.getDatasetByPath("train").getData();
-            baseVectors = IntStream.range(0, baseVectorsArray.length).parallel().mapToObj(i -> vectorTypeSupport.createFloatVector(baseVectorsArray[i])).toArray(VectorFloat<?>[]::new);
-            Dataset queryDataset = hdf.getDatasetByPath("test");
-            if (((FloatingPoint) queryDataset.getDataType()).getBitPrecision() == 64) {
-                // lastfm dataset contains f64 queries but f32 everything else
-                var doubles = ((double[][]) queryDataset.getData());
-                queryVectors = IntStream.range(0, doubles.length).parallel().mapToObj(i -> {
-                    var a = new float[doubles[i].length];
-                    for (int j = 0; j < doubles[i].length; j++) {
-                        a[j] = (float) doubles[i][j];
-                    }
-                    return vectorTypeSupport.createFloatVector(a);
-                }).toArray(VectorFloat<?>[]::new);
-            } else {
-                var queryVectorsArray = (float[][]) queryDataset.getData();
-                queryVectors = IntStream.range(0, queryVectorsArray.length).parallel().mapToObj(i -> vectorTypeSupport.createFloatVector(queryVectorsArray[i])).toArray(VectorFloat<?>[]::new);
-            }
-            int[][] groundTruth = (int[][]) hdf.getDatasetByPath("neighbors").getData();
-            gtSets = new ArrayList<>(groundTruth.length);
-            for (int[] i : groundTruth) {
-                var gtSet = new ArrayList<Integer>(i.length);
-                for (int j : i) {
-                    gtSet.add(j);
-                }
-                gtSets.add(gtSet);
-            }
-        }
-
-        return DataSetUtils.processDataSet(
-                path.getFileName().toString(),
-                props,
-                Arrays.asList(baseVectors),
-                Arrays.asList(queryVectors),
-                gtSets);
-    }
-
-    /// Looks up dataset properties in {@code dataset_metadata.yml}.
-    ///
-    /// @param datasetName the logical dataset name (without {@code .hdf5} extension)
-    /// @return the dataset properties, or a minimal name-only property set if no entry exists
-    private static DataSetProperties getProperties(String datasetName) {
-        return metadata.getProperties(datasetName)
-                .orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName)));
-    }
-
-    /// Downloads the HDF5 file for the given dataset if it is not already present locally.
-    ///
-    /// @param datasetName the logical dataset name (without {@code .hdf5} extension)
-    /// @return the local path to the HDF5 file, or empty if the remote file was not found
-    private Optional<Path> maybeDownloadHdf5(String datasetName) {
-        var dsFilePath = HDF5_DIR.resolve(datasetName+HDF5_EXTN);
-
-        if (Files.exists(dsFilePath)) {
-            return Optional.of(dsFilePath);
-        }
-
-        // Download from https://ann-benchmarks.com/datasetName
-        var url = "https://ann-benchmarks.com/" + datasetName + HDF5_EXTN;
-
-        HttpURLConnection connection;
-        while (true) {
-            int responseCode;
-            try {
-                connection = (HttpURLConnection) new URL(url).openConnection();
-                responseCode = connection.getResponseCode();
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            }
-            if (responseCode == HttpURLConnection.HTTP_NOT_FOUND) {
-                return Optional.empty();
-            }
-            if (responseCode == HttpURLConnection.HTTP_MOVED_PERM || responseCode == HttpURLConnection.HTTP_MOVED_TEMP) {
-                String newUrl = connection.getHeaderField("Location");
-                System.out.println("Redirect detected to URL: " + newUrl);
-                url = newUrl;
-            } else {
-                break;
-            }
-        }
-
-        try (InputStream in = connection.getInputStream()) {
-            Files.createDirectories(dsFilePath.getParent());
-            System.out.println("Downloading: " + url);
-            Files.copy(in, dsFilePath, StandardCopyOption.REPLACE_EXISTING);
-        } catch (IOException e) {
-            throw new RuntimeException("Error downloading data:" + e.getMessage(),e);
-        }
-        return Optional.of(dsFilePath);
-    }
-
-}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java
deleted file mode 100644
index b38d2daf1..000000000
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java
+++ /dev/null
@@ -1,305 +0,0 @@
-/*
- * Copyright DataStax, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.github.jbellis.jvector.example.benchmarks.datasets;
-
-import io.github.jbellis.jvector.example.util.SiftLoader;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider;
-import software.amazon.awssdk.http.crt.AwsCrtAsyncHttpClient;
-import software.amazon.awssdk.regions.Region;
-import software.amazon.awssdk.services.s3.S3AsyncClient;
-import software.amazon.awssdk.services.s3.S3AsyncClientBuilder;
-import software.amazon.awssdk.transfer.s3.S3TransferManager;
-import software.amazon.awssdk.transfer.s3.model.CompletedFileDownload;
-import software.amazon.awssdk.transfer.s3.model.DownloadFileRequest;
-import software.amazon.awssdk.transfer.s3.model.FileDownload;
-import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener;
-
-import java.io.BufferedInputStream;
-import java.io.DataInputStream;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.*;
-
-/**
- * This dataset loader supports <i>multi-file</i> datasets which are comprised of several files as defined in
- * {@link DataSetLoaderMFD.MultiFileDatasource}.
- *
- * <p>The vector similarity function is determined by looking up the dataset name in
- * {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. If no entry is found,
- * an error is thrown.
- */
-public class DataSetLoaderMFD implements DataSetLoader {
-
-    private static final Logger logger = LoggerFactory.getLogger(DataSetLoaderMFD.class);
-
-    private final static Set<String> infraDatasets = Set.of("dpr-1M", "dpr-10M", "cap-1M", "cap-6M", "cohere-english-v3-1M", "cohere-english-v3-10M");
-    private static final String infraBucketName = "jvector-datasets-infratest";
-    private static final String fvecDir = "fvec";
-    private static final String bucketName = "astra-vector";
-    private static final List<String> bucketNames = List.of(bucketName, infraBucketName);
-    private static final DataSetMetadataReader metadata = DataSetMetadataReader.load();
-
-    /**
-     * {@inheritDoc}
-     */
-    public Optional<DataSetInfo> loadDataSet(String fileName) {
-        return maybeDownloadFvecs(fileName).map(mfd -> {
-            var props = metadata.getProperties(mfd.name)
-                    .orElseThrow(() -> new IllegalArgumentException(
-                            "No metadata configured in dataset_metadata.yml for MFD dataset: " + mfd.name));
-            props.similarityFunction()
-                    .orElseThrow(() -> new IllegalArgumentException(
-                            "No similarity_function configured in dataset_metadata.yml for MFD dataset: " + mfd.name));
-            return new DataSetInfo(props, () -> mfd.load(props));
-        });
-    }
-
-    /// Downloads the fvec/ivec files for the named dataset from S3 if not already present locally.
-    ///
-    /// @param name the logical dataset name
-    /// @return the datasource descriptor, or empty if the name is not a known multi-file dataset
-    private Optional<MultiFileDatasource> maybeDownloadFvecs(String name) {
-        String bucket = infraDatasets.contains(name) ? infraBucketName : bucketName;
-        var mfd = MultiFileDatasource.byName.get(name);
-        if (mfd == null) {
-            logger.debug("MultiFileDatasource not found for name: [" + name + "]");
-            return Optional.empty();
-        }
-        logger.info("found dataset definition for {}", name);
-
-        // TODO how to detect and recover from incomplete downloads?
-
-        // get directory from paths in keys
-        Path fvecPath = Paths.get(fvecDir);
-        try {
-            Files.createDirectories(fvecPath.resolve(mfd.directory()));
-        } catch (IOException e) {
-            throw new RuntimeException("Failed to create directory: " + fvecDir, e);
-        }
-
-        try (S3AsyncClient s3Client = s3AsyncClientBuilder().build()) {
-            S3TransferManager tm = S3TransferManager.builder().s3Client(s3Client).build();
-            for (var pathFragment : mfd.paths()) {
-                Path localPath = fvecPath.resolve(pathFragment);
-                if (Files.exists(localPath)) {
-                    continue;
-                }
-
-                var urlPath = pathFragment.toString().replace('\\', '/');
-                logger.info("Downloading dataset {} from {}", name, urlPath);
-                DownloadFileRequest downloadFileRequest =
-                        DownloadFileRequest.builder()
-                                .getObjectRequest(b -> b.bucket(bucket).key(urlPath))
-                                .addTransferListener(LoggingTransferListener.create())
-                                .destination(Paths.get(localPath.toString()))
-                                .build();
-
-                // 3 retries
-                boolean downloaded = false;
-                for (int i = 0; i < 3; i++) {
-                    try {
-                        FileDownload downloadFile = tm.downloadFile(downloadFileRequest);
-                        CompletedFileDownload downloadResult = downloadFile.completionFuture().join();
-                        long downloadedSize = Files.size(localPath);
-
-                        // Check if downloaded file size matches the expected size
-                        if (downloadedSize != downloadResult.response().contentLength()) {
-                            logger.error("Incomplete download (got {} of {} bytes). Retrying...",
-                                    downloadedSize, downloadResult.response().contentLength());
-                            Files.deleteIfExists(localPath);
-                            continue;
-                        }
-
-                        // Validate the file header to catch corrupt downloads
-                        if (!validateVecFileHeader(localPath)) {
-                            logger.error("Downloaded file {} has an invalid header; deleting and retrying", urlPath);
-                            Files.deleteIfExists(localPath);
-                            continue;
-                        }
-
-                        logger.info("Downloaded file of length " + downloadedSize);
-                        downloaded = true;
-                        break;
-                    } catch (Exception e) {
-                        logger.error("Download attempt {} failed for {}: {}", i + 1, urlPath, e.getMessage());
-                        Files.deleteIfExists(localPath);
-                    }
-                }
-                if (!downloaded) {
-                    throw new IOException("Failed to download " + urlPath + " after 3 attempts");
-                }
-            }
-            tm.close();
-        } catch (Exception e) {
-            throw new RuntimeException("Error downloading data from S3: " + e.getMessage());
-        }
-
-        return Optional.of(mfd);
-    }
-
-    /// Reads the first 4 bytes of a vec file (fvecs or ivecs) and checks that the
-    /// little-endian int32 dimension/count value is positive and reasonable.
-    private static boolean validateVecFileHeader(Path path) {
-        try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))) {
-            int dimension = Integer.reverseBytes(dis.readInt());
-            return dimension > 0 && dimension <= 100_000;
-        } catch (IOException e) {
-            return false;
-        }
-    }
-
-    /// Creates an S3 async client builder configured for anonymous access to US-EAST-1.
-    private static S3AsyncClientBuilder s3AsyncClientBuilder() {
-        return S3AsyncClient.builder()
-                .region(Region.US_EAST_1)
-                .httpClient(AwsCrtAsyncHttpClient.builder()
-                        .maxConcurrency(16)
-                        .build())
-                .credentialsProvider(AnonymousCredentialsProvider.create());
-    }
-
-    /// Describes a dataset stored as three separate fvec/ivec files (base vectors, query
-    /// vectors, and ground truth) in an S3 bucket. Known datasets are registered in {@link #byName}.
-    public static class MultiFileDatasource {
-        public final String name;
-        public final Path basePath;
-        public final Path queriesPath;
-        public final Path groundTruthPath;
-        private final static String DATASET_HASH = System.getenv("DATASET_HASH");
-
-        public MultiFileDatasource(String name, String basePath, String queriesPath, String groundTruthPath) {
-            this.name = name;
-            this.basePath = Paths.get(basePath);
-            this.queriesPath = Paths.get(queriesPath);
-            this.groundTruthPath = Paths.get(groundTruthPath);
-        }
-
-        /// Returns the parent directory of the base vectors file.
-        public Path directory() {
-            return basePath.getParent();
-        }
-
-        /// Returns the three file paths (base, queries, ground truth) that comprise this dataset.
-        public Iterable<Path> paths() {
-            return List.of(basePath, queriesPath, groundTruthPath);
-        }
-
-        /// Reads the fvec/ivec files from disk and processes the dataset using the
-        /// configured dataset properties.
-        ///
-        /// @param props the dataset properties controlling similarity and load behavior
-        /// @return the loaded dataset
-        public DataSet load(DataSetProperties props) {
-            var baseVectors = SiftLoader.readFvecs("fvec/" + basePath);
-            var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath);
-            var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath);
-            return DataSetUtils.processDataSet(name, props, baseVectors, queryVectors, gtVectors);
-        }
-
-        public static Map<String, MultiFileDatasource> byName = new HashMap<>() {{
-            put("degen-200k", new MultiFileDatasource("degen-200k",
-                                                       "ada-degen/degen_base_vectors.fvec",
-                                                       "ada-degen/degen_query_vectors.fvec",
-                                                       "ada-degen/degen_ground_truth.ivec"));
-            put("cohere-english-v3-100k", new MultiFileDatasource("cohere-english-v3-100k",
-                                                                  "wikipedia_squad/100k/cohere_embed-english-v3.0_1024_base_vectors_100000.fvec",
-                                                                  "wikipedia_squad/100k/cohere_embed-english-v3.0_1024_query_vectors_10000.fvec",
-                                                                  "wikipedia_squad/100k/cohere_embed-english-v3.0_1024_indices_b100000_q10000_k100.ivec"));
-            put("cohere-english-v3-1M", new MultiFileDatasource("cohere-english-v3-1M",
-                    DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_base_1m_norm.fvecs",
-                    DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_query_10k_norm.fvecs",
-                    DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_gt_1m_ip_k100.ivecs"));
-            put("cohere-english-v3-10M", new MultiFileDatasource("cohere-english-v3-10M",
-                    DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_base_10m_norm.fvecs",
-                    DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_query_10k_norm.fvecs",
-                    DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_gt_10m_ip_k100.ivecs"));
-            put("colbert-10M", new MultiFileDatasource("colbert-10M",
-                                                       "wikipedia_squad/10M/colbertv2.0_128_base_vectors_10000000.fvec",
-                                                       "wikipedia_squad/10M/colbertv2.0_128_query_vectors_100000.fvec",
-                                                       "wikipedia_squad/10M/colbertv2.0_128_indices_b10000000_q100000_k100.ivec"));
-            put("colbert-1M", new MultiFileDatasource("colbert-1M",
-                                                       "wikipedia_squad/1M/colbertv2.0_128_base_vectors_1000000.fvec",
-                                                       "wikipedia_squad/1M/colbertv2.0_128_query_vectors_100000.fvec",
-                                                       "wikipedia_squad/1M/colbertv2.0_128_indices_b1000000_q100000_k100.ivec"));
-            put("nv-qa-v4-100k", new MultiFileDatasource("nv-qa-v4-100k",
-                                                         "wikipedia_squad/100k/nvidia-nemo_1024_base_vectors_100000.fvec",
-                                                         "wikipedia_squad/100k/nvidia-nemo_1024_query_vectors_10000.fvec",
-                                                         "wikipedia_squad/100k/nvidia-nemo_1024_indices_b100000_q10000_k100.ivec"));
-            put("openai-v3-large-3072-100k", new MultiFileDatasource("openai-v3-large-3072-100k",
-                                                                     "wikipedia_squad/100k/text-embedding-3-large_3072_100000_base_vectors.fvec",
-                                                                     "wikipedia_squad/100k/text-embedding-3-large_3072_100000_query_vectors_10000.fvec",
-                                                                     "wikipedia_squad/100k/text-embedding-3-large_3072_100000_indices_query_10000.ivec"));
-            put("openai-v3-large-1536-100k", new MultiFileDatasource("openai-v3-large-1536-100k",
-                                                                     "wikipedia_squad/100k/text-embedding-3-large_1536_100000_base_vectors.fvec",
-                                                                     "wikipedia_squad/100k/text-embedding-3-large_1536_100000_query_vectors_10000.fvec",
-                                                                     "wikipedia_squad/100k/text-embedding-3-large_1536_100000_indices_query_10000.ivec"));
-            put("openai-v3-small-100k", new MultiFileDatasource("openai-v3-small-100k",
-                                                                "wikipedia_squad/100k/text-embedding-3-small_1536_100000_base_vectors.fvec",
-                                                                "wikipedia_squad/100k/text-embedding-3-small_1536_100000_query_vectors_10000.fvec",
-                                                                "wikipedia_squad/100k/text-embedding-3-small_1536_100000_indices_query_10000.ivec"));
-            put("ada002-100k", new MultiFileDatasource("ada002-100k",
-                                                       "wikipedia_squad/100k/ada_002_100000_base_vectors.fvec",
-                                                       "wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec",
-                                                       "wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec"));
-            put("ada002-1M", new MultiFileDatasource("ada002-1M",
-                                                     "wikipedia_squad/1M/ada_002_1000000_base_vectors.fvec",
-                                                     "wikipedia_squad/1M/ada_002_1000000_query_vectors_10000.fvec",
-                                                     "wikipedia_squad/1M/ada_002_1000000_indices_query_10000.ivec"));
-            put("e5-small-v2-100k", new MultiFileDatasource("e5-small-v2-100k",
-                                                            "wikipedia_squad/100k/intfloat_e5-small-v2_100000_base_vectors.fvec",
-                                                            "wikipedia_squad/100k/intfloat_e5-small-v2_100000_query_vectors_10000.fvec",
-                                                            "wikipedia_squad/100k/intfloat_e5-small-v2_100000_indices_query_10000.ivec"));
-            put("e5-base-v2-100k", new MultiFileDatasource("e5-base-v2-100k",
-                                                           "wikipedia_squad/100k/intfloat_e5-base-v2_100000_base_vectors.fvec",
-                                                           "wikipedia_squad/100k/intfloat_e5-base-v2_100000_query_vectors_10000.fvec",
-                                                           "wikipedia_squad/100k/intfloat_e5-base-v2_100000_indices_query_10000.ivec"));
-            put("e5-large-v2-100k", new MultiFileDatasource("e5-large-v2-100k",
-                                                            "wikipedia_squad/100k/intfloat_e5-large-v2_100000_base_vectors.fvec",
-                                                            "wikipedia_squad/100k/intfloat_e5-large-v2_100000_query_vectors_10000.fvec",
-                                                            "wikipedia_squad/100k/intfloat_e5-large-v2_100000_indices_query_10000.ivec"));
-            put("gecko-100k", new MultiFileDatasource("gecko-100k",
-                                                      "wikipedia_squad/100k/textembedding-gecko_100000_base_vectors.fvec",
-                                                      "wikipedia_squad/100k/textembedding-gecko_100000_query_vectors_10000.fvec",
-                                                      "wikipedia_squad/100k/textembedding-gecko_100000_indices_query_10000.ivec"));
-            put("gecko-1M", new MultiFileDatasource("gecko-1M",
-                    "wikipedia_squad/1M/textembedding-gecko_1000000_base_vectors.fvec",
-                    "wikipedia_squad/1M/textembedding-gecko_1000000_query_vectors_10000.fvec",
-                    "wikipedia_squad/1M/textembedding-gecko_1000000_indices_query_10000.ivec"));
-            put("dpr-1M", new MultiFileDatasource("dpr-1M",
-                    DATASET_HASH + "/dpr/c4-en_base_1M_norm_files0_2.fvecs",
-                    DATASET_HASH + "/dpr/c4-en_query_10k_norm_files0_1.fvecs",
-                    DATASET_HASH + "/dpr/dpr_1m_gt_norm_ip_k100.ivecs"));
-            put("dpr-10M", new MultiFileDatasource("dpr-10M",
-                    DATASET_HASH + "/dpr/c4-en_base_10M_norm_files0_2.fvecs",
-                    DATASET_HASH + "/dpr/c4-en_query_10k_norm_files0_1.fvecs",
-                    DATASET_HASH + "/dpr/dpr_10m_gt_norm_ip_k100.ivecs"));
-            put("cap-1M", new MultiFileDatasource("cap-1M",
-                    DATASET_HASH + "/cap/Caselaw_gte-Qwen2-1.5B_embeddings_base_1m_norm_shuffle.fvecs",
-                    DATASET_HASH + "/cap/Caselaw_gte-Qwen2-1.5B_embeddings_query_10k_norm_shuffle.fvecs",
-                    DATASET_HASH + "/cap/cap_1m_gt_norm_shuffle_ip_k100.ivecs"));
-            put("cap-6M", new MultiFileDatasource("cap-6M",
-                    DATASET_HASH + "/cap/Caselaw_gte-Qwen2-1.5B_embeddings_base_6m_norm_shuffle.fvecs",
-                    DATASET_HASH + "/cap/Caselaw_gte-Qwen2-1.5B_embeddings_query_10k_norm_shuffle.fvecs",
-                    DATASET_HASH + "/cap/cap_6m_gt_norm_shuffle_ip_k100.ivecs"));
-        }};
-    }
-}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFD.java
new file mode 100644
index 000000000..5582e27e8
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFD.java
@@ -0,0 +1,928 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.jbellis.jvector.example.benchmarks.datasets;
+
+import io.github.jbellis.jvector.example.util.SiftLoader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.yaml.snakeyaml.Yaml;
+import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider;
+import software.amazon.awssdk.regions.Region;
+import software.amazon.awssdk.services.s3.S3AsyncClient;
+import software.amazon.awssdk.transfer.s3.S3TransferManager;
+import software.amazon.awssdk.transfer.s3.model.CompletedFileDownload;
+import software.amazon.awssdk.transfer.s3.model.DownloadFileRequest;
+import software.amazon.awssdk.transfer.s3.model.FileDownload;
+import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UncheckedIOException;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.stream.Stream;
+
+/// A dataset loader that works with fvec/ivec datasets described by YAML catalog files
+/// matching {@code *.yaml} or {@code *.yml}.
+/// Supports S3, HTTP, local-only, and combined remote+local modes.
+///
+/// ### Catalog format
+///
+/// Each YAML catalog file lists datasets with their base, query, and ground truth
+/// files. Optional fields control where files are stored and fetched:
+///
+/// - {@code base_url} — overrides the default remote base URL for this entry
+/// - {@code cache_dir} — overrides where files are cached locally (relative or absolute path)
+///
+/// A special {@code _defaults} entry provides default values that are folded into all other
+/// entries (unless the entry already specifies a value). Any root key starting with {@code _}
+/// is excluded from dataset names.
+///
+/// The environment variable {@code DATASET_CACHE_DIR} sets a global default cache directory
+/// when no {@code cache_dir} is specified at any level.
+///
+/// Field values may contain {@code ${VAR}} references to environment variables, which are
+/// expanded at load time. The bash-style {@code ${VAR:-default}} syntax is supported to
+/// provide a fallback value when the variable is not set. An {@link IllegalArgumentException}
+/// is thrown if a referenced variable is not set and no default is provided.
+///
+/// A special {@code _include} entry can reference a remote catalog URL. The remote catalog
+/// is fetched and its raw contents are cached locally in a hidden snapshot file for offline use.
+/// On each run, the effective included entries are rebuilt by applying the local
+/// {@code _defaults} to the fetched (or cached) remote entries. Local entries in the same
+/// wrapper file are processed afterward and therefore take precedence over included remote entries.
+/// This lets a single local file act as a thin configuration wrapper around a remote catalog:
+/// ```yaml
+/// _defaults:
+///   cache_dir: ${DATASET_CACHE_DIR:-fvec}
+/// _include:
+///   url: s3://bucket/datasets-clean/catalog_entries.yaml
+/// ```
+/// A fuller catalog, with shared {@code _defaults} and a per-entry override, looks like this:
+/// ```yaml
+/// _defaults:
+///   base_url: s3://my-bucket/${DATASET_HASH}/
+///   cache_dir: /data/cache
+///
+/// ada002-100k:
+///   base: ada_002_100k_base_99287.fvecs
+///   query: ada_002_100k_query_10000.fvecs
+///   gt: ada_002_100k_gt_ip_100.ivecs
+///
+/// # private dataset with its own remote source and cache location
+/// dpr-1M:
+///   base_url: s3://my-bucket/SECRET_HASH/dpr/
+///   cache_dir: /fast-ssd/dpr
+///   base: c4-en_base_1M_norm.fvecs
+///   query: c4-en_query_10k_norm.fvecs
+///   gt: dpr_1m_gt_norm_ip_k100.ivecs
+/// ```
+/// Filenames are resolved relative to the entry's cache directory (local) or the base URL (remote).
+/// When {@code base_url} is present on an entry, it is used instead of the loader's default remote
+/// base URL for that entry's files.
+///
+/// ### Usage patterns
+///
+/// **Remote with local caching** — files are downloaded on first use and cached locally.
+/// Subsequent runs use cached files. Set {@code checkForUpdates=true} to be warned when the
+/// remote catalog changes. Supports both HTTP and S3 URLs.
+/// ```java
+/// var loader = new DataSetLoaderSimpleMFD(
+///     "s3://bucket/datasets-clean/catalog_entries.yaml",
+///     "fvec/catalog_entries.yaml",    // local cache path
+///     true                            // warn if remote catalog differs from local
+/// );
+/// ```
+///
+/// **Local-only with recursive discovery** — the single-arg constructor accepts a directory
+/// and recursively scans it for all {@code .yaml}/{@code .yml} files. This lets you organise
+/// datasets in subdirectories, including private datasets with per-entry {@code base_url} overrides:
+/// ```
+/// local_datasets/
+///   mydatasets/
+///     user_entries.yaml               # your personal local datasets
+///   private-infra/
+///     private_entries.yaml            # private remote datasets with base_url per entry
+/// ```
+/// ```java
+/// var loader = new DataSetLoaderSimpleMFD("local_datasets");
+/// ```
+///
+/// **Remote+local hybrid** — if the local directory already contains {@code catalog_entries.yaml}
+/// and data files, they are used as-is. Missing data files are downloaded from the remote.
+/// ```java
+/// var loader = new DataSetLoaderSimpleMFD(
+///     "s3://bucket/datasets-clean/catalog_entries.yaml",
+///     "/data/datasets/catalog_entries.yaml",
+///     true
+/// );
+/// ```
+///
+/// ### Metadata
+///
+/// Dataset metadata (similarity function, load behavior) is resolved from
+/// {@code dataset-metadata.yml} via {@link DataSetMetadataReader}. A custom metadata reader
+/// can be provided via the 4-argument constructor.
+///
+/// @see DataSetLoader
+public class DataSetLoaderSimpleMFD implements DataSetLoader {
+
+    private static final Logger logger = LoggerFactory.getLogger(DataSetLoaderSimpleMFD.class);
+    private static final String DEFAULT_CATALOG_FILENAME = "catalog_entries.yaml";
+    private static final String CATALOG_GLOB = "*.{yaml,yml}";
+
+    // ========================================================================================
+    // LOG REDACTION — auto-redacts secret-like path segments to prevent leakage
+    // ========================================================================================
+
+    /// Minimum number of hex characters (ignoring separators) for a path segment to be
+    /// considered a potential secret (hash, API key, token, etc.).
+    private static final int MIN_HEX_CHARS = 20;
+
+    /// Whether secret-like path segments are redacted. Redaction is on by default and is
+    /// switched off only when the JVECTOR_LOG_REDACT environment variable equals "false"
+    /// (compared case-insensitively); an unset variable or any other value leaves it enabled.
+    /// The environment is read once, when this class is initialized.
+    private static final boolean REDACT_ENABLED =
+            !"false".equalsIgnoreCase(System.getenv("JVECTOR_LOG_REDACT"));
+
+    /// Masks every path segment of {@code value} that resembles a secret (hash, API key,
+    /// token) so that it cannot leak through log output or exception messages.
+    ///
+    /// The text of {@code value.toString()} is split on {@code /} and {@code \}. Each segment
+    /// that {@code looksLikeSecret} accepts is replaced with {@code [[redacted]]}; all other
+    /// segments and every separator are copied through unchanged, so ordinary names such as
+    /// {@code datasets-clean} or {@code e5-base-v2-100k} survive.
+    ///
+    /// Returns {@code "null"} for {@code null}. Set {@code JVECTOR_LOG_REDACT=false} to disable.
+    static String redact(Object value) {
+        if (value == null) {
+            return "null";
+        }
+        String text = value.toString();
+        if (!REDACT_ENABLED || text.isEmpty()) {
+            return text;
+        }
+
+        // Single pass: a segment ends at every separator and at the end of the text. Empty
+        // segments (around doubled or edge separators) are never secret-like, so they add nothing.
+        int length = text.length();
+        var out = new StringBuilder(length);
+        int segmentStart = 0;
+        for (int pos = 0; pos <= length; pos++) {
+            boolean atEnd = pos == length;
+            if (atEnd || text.charAt(pos) == '/' || text.charAt(pos) == '\\') {
+                String segment = text.substring(segmentStart, pos);
+                out.append(looksLikeSecret(segment) ? "[[redacted]]" : segment);
+                if (!atEnd) out.append(text.charAt(pos));
+                segmentStart = pos + 1;
+            }
+        }
+        return out.toString();
+    }
+
+    /// Decides whether a single path segment resembles a hash, token, or API key.
+    /// A leading {@code 0x}/{@code 0X} and the separators {@code -}, {@code .}, {@code _} are ignored.
+    private static boolean looksLikeSecret(String segment) {
+        // skip over a 0x / 0X prefix by starting the scan after it
+        int from = (segment.startsWith("0x") || segment.startsWith("0X")) ? 2 : 0;
+
+        int hexDigits = 0;
+        int significant = 0; // everything that is not a separator
+        for (int idx = from; idx < segment.length(); idx++) {
+            char ch = segment.charAt(idx);
+            switch (ch) {
+                case '-':
+                case '.':
+                case '_':
+                    continue; // separators count toward neither total
+                default:
+                    significant++;
+            }
+            if ("0123456789abcdefABCDEF".indexOf(ch) >= 0) {
+                hexDigits++;
+            }
+        }
+
+        // enough hex digits, and they must make up at least 75% of the significant characters
+        return hexDigits >= MIN_HEX_CHARS && significant > 0 && (double) hexDigits / significant >= 0.75;
+    }
+
+    /// Origin of a catalog entry. Local entries always take precedence over included remote entries.
+    private enum CatalogSource {
+        LOCAL,           // presumably: declared directly in a local catalog file — confirm at the parse site
+        INCLUDED_REMOTE  // presumably: pulled in from a remote catalog via `_include` — confirm at the parse site
+    }
+
+    /// Resolved entry in the merged catalog: a plain holder whose constructor stores its arguments
+    /// as-is. The source is kept so file resolution, precedence, and base URL overrides work correctly.
+    private static class CatalogEntry {
+        final Map<String, String> fields; // stored by reference, not copied; presumably the entry's YAML keys (base, query, gt, ...) — confirm
+        final Path cacheDir;       // where data files are cached locally
+        final String baseUrl;      // per-entry base_url override, or null
+        final CatalogSource source; // where this entry came from (LOCAL or INCLUDED_REMOTE)
+
+        CatalogEntry(Map<String, String> fields, Path cacheDir, String baseUrl, CatalogSource source) {
+            this.fields = fields;
+            this.cacheDir = cacheDir;
+            this.baseUrl = baseUrl;
+            this.source = source;
+        }
+    }
+
+    private static final String ENV_DATASET_CACHE_DIR = "DATASET_CACHE_DIR";
+
+    private final String remoteBasePath;
+    private final Map<String, CatalogEntry> catalog;
+    private final Path localCacheDir;
+    private final DataSetMetadataReader metadata;
+    private final HttpClient httpClient;
+
+    // S3 instances for connection pooling
+    private S3AsyncClient s3Client;
+    private S3TransferManager s3TransferManager;
+
+    /// Builds a loader that works purely from local catalogs: no remote catalog URL is used
+    /// and no update check is performed. Dataset metadata comes from
+    /// {@link DataSetMetadataReader#load()}.
+    ///
+    /// {@code localPath} is either a directory, which is searched recursively for
+    /// {@code .yaml}/{@code .yml} catalog files, or the full path of one catalog YAML file.
+    ///
+    /// A missing path, or one without any catalog file, is not an error: the loader is still
+    /// created and simply reports every dataset as absent, which makes it safe to register
+    /// in a loader list even when no local datasets exist.
+    ///
+    /// @param localPath directory to scan, or full path to a single catalog YAML file
+    public DataSetLoaderSimpleMFD(String localPath) {
+        this(null, localPath, false, DataSetMetadataReader.load());
+    }
+
+    /// Builds a loader whose dataset metadata comes from the default
+    /// {@code dataset-metadata.yml} (via {@link DataSetMetadataReader#load()}).
+    ///
+    /// {@code localPath} is interpreted by its suffix: a value ending in {@code .yaml} or
+    /// {@code .yml} names the catalog file itself; anything else is treated as a directory
+    /// that is scanned recursively for {@code .yaml}/{@code .yml} files.
+    ///
+    /// Cache directory resolution, highest priority first:
+    /// - an entry-level or {@code _defaults}-level {@code cache_dir};
+    /// - the {@code DATASET_CACHE_DIR} environment variable, when set;
+    /// - the directory containing the catalog file.
+    ///
+    /// When no local catalog exists and the catalog is fetched from {@code catalogUrl} by the
+    /// constructor, the last fallback is {@code dataset_cache/} instead.
+    ///
+    /// @param catalogUrl      full HTTP or S3 URL of the remote catalog; null or empty selects
+    ///                        local-only mode
+    /// @param localPath       local directory, or full path to a catalog YAML file
+    /// @param checkForUpdates when true and a local catalog is already present, the remote
+    ///                        catalog is fetched and compared, and a warning is logged if
+    ///                        the two differ
+    public DataSetLoaderSimpleMFD(String catalogUrl, String localPath, boolean checkForUpdates) {
+        this(catalogUrl, localPath, checkForUpdates, DataSetMetadataReader.load());
+    }
+
+    /// Creates a loader with a custom metadata reader for resolving dataset properties.
+    ///
+    /// Local catalogs are discovered first: a {@code localPath} ending in {@code .yaml} or
+    /// {@code .yml} is loaded as a single catalog file, otherwise the directory is scanned
+    /// recursively. If any local entries are found they become the catalog. Only when none
+    /// are found and a remote {@code catalogUrl} is given is the remote catalog fetched,
+    /// used, and saved to the local catalog location for later runs.
+    ///
+    /// @param catalogUrl      the full URL (HTTP or S3) to the remote catalog, or null/empty
+    ///                        for local-only mode
+    /// @param localPath       the local directory or full path to a catalog YAML file
+    /// @param checkForUpdates if true and a local catalog already exists, the remote catalog is
+    ///                        fetched and compared; a warning is logged if they differ.
+    ///                        Ignored when catalogUrl is null/empty.
+    /// @param metadata        the metadata reader for resolving dataset properties
+    public DataSetLoaderSimpleMFD(String catalogUrl, String localPath, boolean checkForUpdates, DataSetMetadataReader metadata) {
+        this.metadata = metadata;
+        this.httpClient = HttpClient.newBuilder()
+                .followRedirects(HttpClient.Redirect.NORMAL)
+                .build();
+
+        // resolve localPath for catalog discovery. For discovered local/include catalogs,
+        // entries without an explicit cache_dir fall back to DATASET_CACHE_DIR or the
+        // catalog file's directory. Pure constructor-driven remote catalogs fall back to
+        // dataset_cache.
+        Path resolvedPath = Paths.get(localPath);
+        boolean singleFileMode = localPath.endsWith(".yaml") || localPath.endsWith(".yml");
+        Path localCatalog = singleFileMode
+                ? resolvedPath
+                : resolvedPath.resolve(DEFAULT_CATALOG_FILENAME);
+        this.localCacheDir = Paths.get("dataset_cache");
+
+        // determine whether we have a remote URL (S3 or HTTP)
+        boolean isRemote = catalogUrl != null && !catalogUrl.isEmpty()
+                && (catalogUrl.startsWith("http://") || catalogUrl.startsWith("https://") || catalogUrl.startsWith("s3://"));
+
+        // derive remote base path by stripping the filename from the catalog URL
+        this.remoteBasePath = isRemote
+                ? catalogUrl.substring(0, catalogUrl.lastIndexOf('/') + 1)
+                : null;
+
+        // load local catalog entries — either from a single file or by scanning a directory tree
+        Map<String, CatalogEntry> localEntries = new HashMap<>();
+        if (singleFileMode) {
+            if (Files.exists(localCatalog)) {
+                loadCatalogEntries(localCatalog, localEntries);
+            }
+        } else if (Files.isDirectory(resolvedPath)) {
+            // recursive scan mode
+            scanForCatalogs(resolvedPath, localEntries);
+        } else if (Files.exists(localCatalog)) {
+            // directory doesn't exist yet but might after remote fetch — check the default file
+            loadCatalogEntries(localCatalog, localEntries);
+        }
+
+        if (!localEntries.isEmpty()) {
+            // report the path that was actually searched, not the fixed dataset_cache directory
+            logger.info("Loaded {} datasets from local catalog(s) under {}", localEntries.size(), redact(resolvedPath));
+            this.catalog = localEntries;
+            if (isRemote && checkForUpdates) {
+                checkRemoteCatalogForUpdates(catalogUrl, localEntries);
+            }
+        } else if (isRemote) {
+            logger.info("No local catalog found, fetching from {}", redact(catalogUrl));
+            var remoteCatalogData = fetchRemoteCatalogRaw(catalogUrl);
+            this.catalog = toCatalogEntries(remoteCatalogData, localCacheDir);
+            saveCatalogLocally(localCatalog, catalogUrl, remoteCatalogData);
+        } else {
+            // report the path that was actually searched, not the fixed dataset_cache directory
+            logger.info("No catalog found under {}. This loader will not match any datasets.", redact(resolvedPath));
+            this.catalog = Map.of();
+        }
+    }
+
+    /// Looks up a dataset by name in the merged catalog and makes its files available locally.
+    ///
+    /// The dataset's properties are resolved from the metadata reader before anything is
+    /// downloaded, so a missing metadata entry fails immediately rather than after the
+    /// base, query and ground-truth files have been transferred. The three files are then
+    /// ensured concurrently. The vectors themselves are only read when the returned
+    /// {@code DataSetInfo}'s supplier is invoked.
+    ///
+    /// @param dataSetName catalog key of the dataset
+    /// @return the dataset info, or empty if the name is not in the catalog or its entry
+    ///         lacks one of the {@code base}, {@code query}, {@code gt} fields
+    /// @throws IllegalArgumentException if the dataset is in the catalog but has no metadata entry
+    /// @throws RuntimeException if any of the dataset files cannot be obtained
+    @Override
+    public Optional<DataSetInfo> loadDataSet(String dataSetName) {
+        var entry = catalog.get(dataSetName);
+        if (entry == null) return Optional.empty();
+
+        var baseFile = entry.fields.get("base");
+        var queryFile = entry.fields.get("query");
+        var gtFile = entry.fields.get("gt");
+        if (baseFile == null || queryFile == null || gtFile == null) {
+            logger.error("Dataset '{}' is missing required fields (base, query, gt) in catalog", dataSetName);
+            return Optional.empty();
+        }
+
+        logger.info("Found dataset '{}' in catalog", dataSetName);
+
+        // fail fast: resolve metadata before spending time and bandwidth on downloads
+        var props = metadata.getProperties(dataSetName)
+                .orElseThrow(() -> new IllegalArgumentException(
+                        String.format(
+                                "Dataset '%s' was found in dataset catalog, but no metadata entry was found in dataset-metadata.yml. ",
+                                dataSetName)));
+
+        var startTime = System.nanoTime();
+
+        // determine the effective remote base URL and local cache directory for this entry
+        String effectiveBaseUrl = entry.baseUrl != null ? entry.baseUrl : remoteBasePath;
+        Path effectiveCacheDir = entry.cacheDir;
+
+        // Execute downloads simultaneously to maximize network bandwidth
+        try {
+            var f1 = CompletableFuture.runAsync(() -> ensureQuietly(baseFile, effectiveCacheDir, effectiveBaseUrl));
+            var f2 = CompletableFuture.runAsync(() -> ensureQuietly(queryFile, effectiveCacheDir, effectiveBaseUrl));
+            var f3 = CompletableFuture.runAsync(() -> ensureQuietly(gtFile, effectiveCacheDir, effectiveBaseUrl));
+
+            // join() rethrows a failure of any of the three as an unchecked exception
+            CompletableFuture.allOf(f1, f2, f3).join();
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to obtain dataset files for " + dataSetName, e);
+        }
+
+        logger.info("Dataset files ready for '{}' in {}s", dataSetName, String.format("%.2f", (System.nanoTime() - startTime) / 1e9));
+
+        return Optional.of(new DataSetInfo(props, () -> {
+            var baseVectors = SiftLoader.readFvecs(effectiveCacheDir.resolve(baseFile).toString());
+            var queryVectors = SiftLoader.readFvecs(effectiveCacheDir.resolve(queryFile).toString());
+            var gtVectors = SiftLoader.readIvecs(effectiveCacheDir.resolve(gtFile).toString());
+            return DataSetUtils.processDataSet(dataSetName, props, baseVectors, queryVectors, gtVectors);
+        }));
+    }
+
+    // ========================================================================================
+    // CATALOG DISCOVERY & LOADING
+    // ========================================================================================
+
+    /// Returns the effective source for a discovered catalog file.
+    /// Generated remote-catalog snapshots are treated as included remote entries so that
+    /// real local catalogs continue to take precedence across runs.
+    ///
+    /// A file counts as a generated snapshot when its {@code _meta} mapping has
+    /// {@code generated_remote_catalog} set to true. The flag is compared through its string
+    /// form because the YAML parser yields a Boolean for an unquoted {@code true} and a
+    /// String for a quoted one; both are accepted.
+    ///
+    /// @param raw the parsed catalog file
+    /// @return {@code INCLUDED_REMOTE} for a generated snapshot, otherwise {@code LOCAL}
+    private static CatalogSource catalogSource(Map<String, Map<String, String>> raw) {
+        // read through Object: the generic types are unchecked, so the runtime values
+        // may not be the Map/String the signature promises
+        Object meta = raw.get("_meta");
+        if (meta instanceof Map) {
+            Object flag = ((Map<?, ?>) meta).get("generated_remote_catalog");
+            if (flag != null && "true".equalsIgnoreCase(flag.toString())) {
+                return CatalogSource.INCLUDED_REMOTE;
+            }
+        }
+        return CatalogSource.LOCAL;
+    }
+
+    /// Adds {@code entry} under {@code name} unless doing so would replace a real local
+    /// entry with an included remote one. In every other case (no previous entry, a new
+    /// local entry, or a previous entry that is not local) the new entry is stored.
+    private static void putCatalogEntry(Map<String, CatalogEntry> target, String name, CatalogEntry entry) {
+        CatalogEntry current = target.get(name);
+        boolean wouldDemoteLocal = current != null
+                && entry.source != CatalogSource.LOCAL
+                && current.source == CatalogSource.LOCAL;
+        if (wouldDemoteLocal) {
+            return;
+        }
+        target.put(name, entry);
+    }
+
+    /// Computes where the raw contents of an included remote catalog are persisted: a file
+    /// inside the {@code .catalog-cache} directory next to the including catalog, named after
+    /// the SHA-256 hash of the include URL.
+    private static Path includeCacheFile(Path catalogDir, String includeUrl) {
+        Path cacheRoot = catalogDir.resolve(".catalog-cache");
+        String cacheName = "include-" + sha256Hex(includeUrl) + ".yaml.cache";
+        return cacheRoot.resolve(cacheName);
+    }
+
+    private static String sha256Hex(String value) {
+        try {
+            MessageDigest digest = MessageDigest.getInstance("SHA-256");
+            byte[] bytes = digest.digest(value.getBytes(StandardCharsets.UTF_8));
+            StringBuilder hex = new StringBuilder(bytes.length * 2);
+            for (byte b : bytes) {
+                hex.append(Character.forDigit((b >> 4) & 0xF, 16));
+                hex.append(Character.forDigit(b & 0xF, 16));
+            }
+            return hex.toString();
+        } catch (NoSuchAlgorithmException e) {
+            throw new IllegalStateException("SHA-256 should always be available", e);
+        }
+    }
+
+    /// Recursively scans a directory tree for {@code .yaml}/{@code .yml} files and merges
+    /// all entries into the given map. Later entries of the same source type may override
+    /// earlier ones, but real local entries always take precedence over included remote entries.
+    ///
+    /// Catalog files are processed in sorted path order so that "later" is the same on every
+    /// run and every filesystem. A failure to traverse the tree is logged as a warning and
+    /// does not abort construction.
+    ///
+    /// @param rootDir directory to walk
+    /// @param target  map that receives the discovered entries
+    private void scanForCatalogs(Path rootDir, Map<String, CatalogEntry> target) {
+        var matcher = rootDir.getFileSystem().getPathMatcher("glob:" + CATALOG_GLOB);
+        try (Stream<Path> paths = Files.walk(rootDir)) {
+            paths.filter(p -> p.getFileName() != null && matcher.matches(p.getFileName()))
+                    .sorted()
+                    .forEach(catalogFile -> loadCatalogEntries(catalogFile, target));
+        } catch (IOException | UncheckedIOException e) {
+            // Files.walk is lazy: errors hit while iterating arrive as UncheckedIOException
+            logger.warn("Error scanning for catalogs under {}: {}", redact(rootDir), redact(e.getMessage()));
+        }
+    }
+
+    /// Loads entries from a single catalog file into the target map.
+    /// Handles {@code _defaults} folding, {@code _include} remote fetching, and
+    /// {@code _}-prefixed key exclusion.
+    ///
+    /// When {@code _include} is present, its {@code url} value (after env var expansion) is
+    /// treated as a remote catalog URL. The remote entries are fetched and merged with the
+    /// local defaults, so a single local file can act as a thin wrapper around a remote
+    /// catalog. An {@code _include} without a {@code url} is skipped with a warning.
+    ///
+    /// An empty {@code _defaults:} key (which parses to null) is treated as no defaults.
+    ///
+    /// @param catalogFile the catalog YAML file to read
+    /// @param target      map that receives the entries
+    private void loadCatalogEntries(Path catalogFile, Map<String, CatalogEntry> target) {
+        var raw = loadCatalogFromFile(catalogFile);
+        if (raw.isEmpty()) return;
+
+        Path catalogDir = catalogFile.getParent() != null ? catalogFile.getParent() : Paths.get(".");
+        CatalogSource source = catalogSource(raw);
+
+        // extract and expand _defaults if present; a key with no body maps to null, which
+        // getOrDefault would return as-is, so test for null explicitly
+        Map<String, String> rawDefaults = raw.get("_defaults");
+        Map<String, String> defaults = (rawDefaults == null || rawDefaults.isEmpty())
+                ? Map.of()
+                : resolveEnvVars(rawDefaults);
+
+        // handle _include: fetch remote catalog and merge with local defaults
+        Map<String, String> includeEntry = raw.get("_include");
+        if (includeEntry != null) {
+            String includeUrl = includeEntry.get("url");
+            if (includeUrl != null) {
+                includeUrl = expandEnvVars(includeUrl);
+                loadRemoteInclude(includeUrl, defaults, catalogDir, includeCacheFile(catalogDir, includeUrl), target);
+            } else {
+                logger.warn("Ignoring _include in {}: no 'url' field", redact(catalogFile));
+            }
+        }
+
+        // count real entries (non-underscore keys)
+        long entryCount = raw.keySet().stream().filter(k -> !k.startsWith("_")).count();
+        if (entryCount > 0) {
+            logger.info("Loading catalog from {} ({} entries)", redact(catalogFile), entryCount);
+        }
+
+        for (var e : raw.entrySet()) {
+            String name = e.getKey();
+            // skip entries whose key starts with _
+            if (name.startsWith("_")) continue;
+
+            // fold defaults into this entry (entry values take precedence)
+            Map<String, String> fields = new HashMap<>(defaults);
+            if (e.getValue() != null) {
+                fields.putAll(e.getValue());
+            }
+
+            putCatalogEntry(target, name, buildCatalogEntry(fields, catalogDir, source));
+        }
+    }
+
+    /// Pulls in the remote catalog named by an {@code _include} and merges its entries into
+    /// {@code target} as included remote entries.
+    ///
+    /// The fetch also refreshes the cached snapshot file. If the fetch fails, the snapshot is
+    /// used when it exists; when it does not, a warning is logged and nothing is merged.
+    /// Each remote entry is layered on top of the local defaults, and entries that carry no
+    /// {@code base_url} inherit the directory part of the include URL.
+    private void loadRemoteInclude(String includeUrl, Map<String, String> defaults,
+                                   Path catalogDir, Path cachedIncludeFile,
+                                   Map<String, CatalogEntry> target) {
+        Map<String, Map<String, String>> included;
+        boolean fromCache = false;
+
+        try {
+            logger.info("Including remote catalog from {}", redact(includeUrl));
+            included = fetchRemoteCatalogRaw(includeUrl, cachedIncludeFile);
+        } catch (Exception fetchFailure) {
+            if (Files.isRegularFile(cachedIncludeFile)) {
+                logger.warn("Failed to include remote catalog from {}: {}. Using cached catalog {}",
+                        redact(includeUrl), redact(fetchFailure.getMessage()), redact(cachedIncludeFile));
+                included = loadCatalogFromFile(cachedIncludeFile);
+                fromCache = true;
+            } else {
+                logger.warn("Failed to include remote catalog from {}: {}", redact(includeUrl), redact(fetchFailure.getMessage()));
+                return;
+            }
+        }
+
+        // everything up to and including the last '/' of the include URL
+        String remoteBase = null;
+        int cut = includeUrl.lastIndexOf('/');
+        if (cut >= 0) {
+            remoteBase = includeUrl.substring(0, cut + 1);
+        }
+
+        long merged = 0;
+        for (var remote : included.entrySet()) {
+            String name = remote.getKey();
+            if (name.startsWith("_")) {
+                continue;
+            }
+            merged++;
+
+            // start from the local defaults, then let the remote entry's own values win;
+            // real local entries still beat these via putCatalogEntry
+            Map<String, String> fields = new HashMap<>(defaults);
+            Map<String, String> remoteFields = remote.getValue();
+            if (remoteFields != null) {
+                fields.putAll(remoteFields);
+            }
+            // entries without their own base_url download relative to the remote catalog
+            if (remoteBase != null && !fields.containsKey("base_url")) {
+                fields.put("base_url", remoteBase);
+            }
+
+            CatalogEntry built = buildCatalogEntry(fields, catalogDir, CatalogSource.INCLUDED_REMOTE);
+            putCatalogEntry(target, name, built);
+        }
+
+        String origin = fromCache ? "cached" : "remote";
+        logger.info("Included {} datasets from {} catalog", merged, origin);
+    }
+
+    /// Converts a raw catalog map (from a remote fetch) into CatalogEntry objects.
+    /// Handles {@code _defaults} folding and {@code _}-prefixed key exclusion.
+    ///
+    /// An empty {@code _defaults:} key (which parses to null) is treated as no defaults.
+    ///
+    /// @param raw      the parsed remote catalog
+    /// @param localDir directory passed to each entry as its catalog directory
+    /// @return the entries keyed by dataset name, all marked as included remote
+    private static Map<String, CatalogEntry> toCatalogEntries(Map<String, Map<String, String>> raw, Path localDir) {
+        // getOrDefault returns null when the key is present with a null value, so check explicitly
+        Map<String, String> rawDefaults = raw.get("_defaults");
+        Map<String, String> defaults = rawDefaults != null ? rawDefaults : Map.of();
+
+        var result = new HashMap<String, CatalogEntry>();
+        for (var e : raw.entrySet()) {
+            if (e.getKey().startsWith("_")) continue;
+
+            // entry values take precedence over the catalog's defaults
+            Map<String, String> fields = new HashMap<>(defaults);
+            if (e.getValue() != null) {
+                fields.putAll(e.getValue());
+            }
+
+            putCatalogEntry(result, e.getKey(), buildCatalogEntry(fields, localDir, CatalogSource.INCLUDED_REMOTE));
+        }
+        return result;
+    }
+
+    private static final java.util.Set<String> KNOWN_FIELDS = java.util.Set.of(
+            "base", "query", "gt", "base_url", "cache_dir"
+    );
+
+    /// Turns a merged field map into a {@code CatalogEntry}.
+    ///
+    /// Steps: reject any field name outside {@code KNOWN_FIELDS}, expand {@code ${VAR}}
+    /// references in every value, make sure a present {@code base_url} ends in {@code /},
+    /// and pick the cache directory.
+    ///
+    /// @throws IllegalArgumentException if a field name is not recognized
+    private static CatalogEntry buildCatalogEntry(Map<String, String> fields, Path catalogDir, CatalogSource source) {
+        // the first unrecognized field name aborts the build
+        fields.keySet().stream()
+                .filter(key -> !KNOWN_FIELDS.contains(key))
+                .findFirst()
+                .ifPresent(key -> {
+                    throw new IllegalArgumentException(
+                            "Unknown field '" + key + "' in catalog entry. Known fields: " + KNOWN_FIELDS);
+                });
+
+        // expand ${VAR} references in all field values
+        Map<String, String> expanded = resolveEnvVars(fields);
+
+        String baseUrl = expanded.get("base_url");
+        if (baseUrl != null && !baseUrl.endsWith("/")) {
+            baseUrl += "/";
+        }
+
+        // cache_dir precedence: entry field, then DATASET_CACHE_DIR, then the catalog's directory
+        Path cacheDir;
+        String explicitCacheDir = expanded.get("cache_dir");
+        if (explicitCacheDir == null || explicitCacheDir.isEmpty()) {
+            String fromEnv = System.getenv(ENV_DATASET_CACHE_DIR);
+            cacheDir = (fromEnv == null || fromEnv.isEmpty()) ? catalogDir : Paths.get(fromEnv);
+        } else {
+            cacheDir = Paths.get(explicitCacheDir);
+        }
+
+        return new CatalogEntry(expanded, cacheDir, baseUrl, source);
+    }
+
+    /// Matches {@code ${VAR}} and {@code ${VAR:-default}} syntax.
+    private static final java.util.regex.Pattern ENV_VAR_PATTERN =
+            java.util.regex.Pattern.compile("\\$\\{([^:}]+)(?::-((?:[^}]*)?))?}");
+
+    /// Returns a copy of {@code fields} in which every value has had its {@code ${VAR}} and
+    /// {@code ${VAR:-default}} references replaced from the environment. Keys are unchanged.
+    ///
+    /// @throws IllegalArgumentException if a referenced variable is unset and has no default
+    private static Map<String, String> resolveEnvVars(Map<String, String> fields) {
+        Map<String, String> expanded = new HashMap<>(fields.size());
+        fields.forEach((key, rawValue) -> expanded.put(key, expandEnvVars(rawValue)));
+        return expanded;
+    }
+
+    /// Replaces each {@code ${VAR}} / {@code ${VAR:-default}} reference in {@code value} with
+    /// the environment variable's value, or with the default when the variable is unset.
+    /// A null value, or one without {@code ${}, is returned untouched.
+    ///
+    /// @throws IllegalArgumentException if a referenced variable is unset and has no default
+    private static String expandEnvVars(String value) {
+        if (value == null || !value.contains("${")) {
+            return value;
+        }
+        return ENV_VAR_PATTERN.matcher(value).replaceAll(reference -> {
+            String varName = reference.group(1);
+            String fallback = reference.group(2); // null if no :- was present
+            String substitution = System.getenv(varName);
+            if (substitution == null) {
+                if (fallback == null) {
+                    throw new IllegalArgumentException(
+                            "Environment variable '${" + varName + "}' referenced in catalog entry is not set");
+                }
+                substitution = fallback;
+            }
+            // quote so that '$' and '\' in the value are inserted literally
+            return java.util.regex.Matcher.quoteReplacement(substitution);
+        });
+    }
+
+    // ========================================================================================
+    // FILE AVAILABILITY
+    // ========================================================================================
+
+    /// Variant of {@code ensureFileAvailable} for use inside lambdas: an {@link IOException}
+    /// is rethrown wrapped in an {@link UncheckedIOException}.
+    private void ensureQuietly(String filename, Path cacheDir, String baseUrl) {
+        try {
+            ensureFileAvailable(filename, cacheDir, baseUrl);
+        } catch (IOException checked) {
+            throw new UncheckedIOException(checked);
+        }
+    }
+
+    /// Makes sure {@code filename} exists inside {@code cacheDir}. A file that is already
+    /// there is left alone; otherwise it is downloaded from {@code baseUrl + filename},
+    /// creating the parent directory first.
+    ///
+    /// @throws IOException if the file is missing and {@code baseUrl} is null, or if the
+    ///                     download fails
+    private void ensureFileAvailable(String filename, Path cacheDir, String baseUrl) throws IOException {
+        Path target = cacheDir.resolve(filename);
+        if (!Files.exists(target)) {
+            if (baseUrl == null) {
+                throw new IOException("File not found locally and no remote URL configured: " + redact(target));
+            }
+
+            Path targetDir = target.getParent();
+            if (targetDir != null) {
+                Files.createDirectories(targetDir);
+            }
+
+            String sourceUrl = baseUrl + filename;
+            logger.info("Downloading {} -> {}", redact(sourceUrl), redact(target));
+            downloadUrlToFile(sourceUrl, target);
+        }
+    }
+
+    // ========================================================================================
+    // REMOTE CATALOG OPERATIONS
+    // ========================================================================================
+
+    private Map<String, Map<String, String>> fetchRemoteCatalogRaw(String catalogUrl) {
+        return fetchRemoteCatalogRaw(catalogUrl, null);
+    }
+
+    /// Downloads the catalog at {@code catalogUrl} into a temporary file and parses it.
+    ///
+    /// When {@code snapshotFile} is non-null the temporary file is created next to it and,
+    /// once the download has parsed successfully, moved over it so the raw catalog stays
+    /// available for later use. The temporary file is removed in every case.
+    ///
+    /// @throws RuntimeException if an I/O error occurs while fetching or storing the catalog
+    private Map<String, Map<String, String>> fetchRemoteCatalogRaw(String catalogUrl, Path snapshotFile) {
+        try {
+            Path snapshotDir = snapshotFile == null ? null : snapshotFile.getParent();
+
+            Path scratch;
+            if (snapshotDir == null) {
+                scratch = Files.createTempFile("catalog-", ".tmp");
+            } else {
+                Files.createDirectories(snapshotDir);
+                scratch = Files.createTempFile(snapshotDir, "catalog-", ".tmp");
+            }
+
+            try {
+                downloadUrlToFile(catalogUrl, scratch);
+                var parsed = loadCatalogFromFile(scratch);
+
+                // only a download that parsed is allowed to replace the snapshot
+                if (snapshotFile != null) {
+                    Files.move(scratch, snapshotFile,
+                            StandardCopyOption.ATOMIC_MOVE,
+                            StandardCopyOption.REPLACE_EXISTING);
+                }
+                return parsed;
+            } finally {
+                Files.deleteIfExists(scratch);
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to fetch dataset catalog from " + redact(catalogUrl), e);
+        }
+    }
+
+    /// Writes a fetched remote catalog to {@code localCatalog}, adding a {@code _meta} section
+    /// that marks the file as a generated remote snapshot and records its source URL.
+    ///
+    /// If the remote catalog has a {@code _meta} section of its own, its keys are kept but the
+    /// two generated keys always win, so the snapshot marker cannot be overwritten by remote
+    /// content. The file is written to a temporary file and then moved into place.
+    /// Failures are logged as a warning and otherwise ignored (best-effort caching).
+    ///
+    /// @param localCatalog destination file
+    /// @param catalogUrl   URL the catalog was fetched from
+    /// @param catalogData  the parsed remote catalog
+    private void saveCatalogLocally(Path localCatalog, String catalogUrl,
+                                    Map<String, Map<String, String>> catalogData) {
+        try {
+            Path parent = localCatalog.getParent() != null ? localCatalog.getParent() : Paths.get(".");
+            Files.createDirectories(parent);
+
+            Path tempFile = Files.createTempFile(parent, "catalog-", ".tmp");
+            try {
+                Map<String, String> meta = new LinkedHashMap<>();
+                Map<String, String> remoteMeta = catalogData.get("_meta");
+                if (remoteMeta != null) {
+                    meta.putAll(remoteMeta);
+                }
+                // written last so they override any same-named keys from the remote _meta
+                meta.put("generated_remote_catalog", "true");
+                meta.put("remote_catalog_url", catalogUrl);
+
+                Map<String, Map<String, String>> annotated = new LinkedHashMap<>();
+                annotated.put("_meta", meta);
+                for (var e : catalogData.entrySet()) {
+                    // the remote _meta has already been merged above; do not let it replace ours
+                    if (!"_meta".equals(e.getKey())) {
+                        annotated.put(e.getKey(), e.getValue());
+                    }
+                }
+
+                Files.writeString(tempFile, new Yaml().dump(annotated));
+                Files.move(tempFile, localCatalog,
+                        StandardCopyOption.ATOMIC_MOVE,
+                        StandardCopyOption.REPLACE_EXISTING);
+            } finally {
+                Files.deleteIfExists(tempFile);
+            }
+        } catch (Exception e) {
+            logger.warn("Failed to cache catalog locally: {}", redact(e.getMessage()));
+        }
+    }
+
+    /// Parses a catalog YAML file into its raw map form.
+    ///
+    /// The nested generic types are not verified: only the top-level value is checked to be
+    /// a mapping, so individual values may not be the types the signature declares.
+    ///
+    /// @param path the YAML file to read
+    /// @return the parsed mapping, or an empty map for an empty document
+    /// @throws IllegalArgumentException if the document's top level is not a mapping
+    /// @throws RuntimeException if the file cannot be read
+    @SuppressWarnings("unchecked")
+    private static Map<String, Map<String, String>> loadCatalogFromFile(Path path) {
+        try (InputStream in = Files.newInputStream(path)) {
+            // NOTE(review): this method also parses catalogs downloaded from remote URLs.
+            // A default-constructed Yaml may instantiate arbitrary tagged types depending on
+            // the SnakeYAML version in use — consider a SafeConstructor-based loader.
+            Object root = new Yaml().load(in);
+            if (root == null) {
+                return Map.of();
+            }
+            if (!(root instanceof Map)) {
+                // a list or scalar document would otherwise surface as a bare ClassCastException
+                throw new IllegalArgumentException(
+                        "Catalog " + redact(path) + " must contain a top-level YAML mapping");
+            }
+            return (Map<String, Map<String, String>>) root;
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to load catalog from " + redact(path), e);
+        }
+    }
+
+    /// Fetches the remote catalog and compares it to the local one, logging a warning if they differ.
+    private void checkRemoteCatalogForUpdates(String catalogUrl, Map<String, CatalogEntry> localEntries) {
+        try {
+            var remoteCatalogData = fetchRemoteCatalogRaw(catalogUrl);
+            // compare just the dataset names and file fields, ignoring localDir
+            boolean differs = false;
+            if (remoteCatalogData.size() != localEntries.size()) {
+                differs = true;
+            } else {
+                for (var e : remoteCatalogData.entrySet()) {
+                    var local = localEntries.get(e.getKey());
+                    if (local == null || !local.fields.equals(e.getValue())) {
+                        differs = true;
+                        break;
+                    }
+                }
+            }
+            if (differs) {
+                logger.warn("Remote catalog at {} differs from local catalog. Consider updating your local copy.", redact(catalogUrl));
+            }
+        } catch (Exception e) {
+            logger.warn("Could not check remote catalog for updates: {}", redact(e.getMessage()));
+        }
+    }
+
+    // ========================================================================================
+    // TRANSPORT PROTOCOL ROUTING (S3 vs HTTP)
+    // ========================================================================================
+
+    /// Dispatches a download by URL scheme: {@code s3://} goes to the S3 transfer manager,
+    /// {@code http://} and {@code https://} to the HTTP client.
+    ///
+    /// @throws IllegalArgumentException if the URL uses any other scheme
+    private void downloadUrlToFile(String url, Path localPath) throws IOException {
+        if (url.startsWith("s3://")) {
+            downloadFileS3(url, localPath);
+            return;
+        }
+
+        boolean overHttp = url.startsWith("http://") || url.startsWith("https://");
+        if (!overHttp) {
+            throw new IllegalArgumentException("Unsupported URL scheme for download: " + redact(url));
+        }
+        downloadFileHttp(url, localPath);
+    }
+
+    // ========================================================================================
+    // S3 TRANSFER MANAGER IMPLEMENTATION
+    // ========================================================================================
+
+    /// Returns the loader's S3 transfer manager, creating it together with its S3 client on
+    /// the first call. The method is {@code synchronized}, so the pair is created only once.
+    private synchronized S3TransferManager getS3TransferManager() {
+        if (s3TransferManager != null) {
+            return s3TransferManager;
+        }
+        s3Client = s3AsyncClient();
+        s3TransferManager = S3TransferManager.builder()
+                .s3Client(s3Client)
+                .build();
+        return s3TransferManager;
+    }
+
+    /// Downloads an {@code s3://bucket/key} object to {@code localPath}, trying up to three times.
+    ///
+    /// Each attempt writes to a sibling {@code .part} file, which is moved onto
+    /// {@code localPath} only after the transfer completed and, when the response reports a
+    /// content length, the file size matches it. An interrupted or failed transfer therefore
+    /// never leaves a partial file at {@code localPath}, matching the HTTP download path.
+    ///
+    /// @param s3Url     URL of the form {@code s3://bucket/key}
+    /// @param localPath destination file
+    /// @throws IllegalArgumentException if the URL has no bucket or no key
+    /// @throws IOException if all attempts fail
+    private void downloadFileS3(String s3Url, Path localPath) throws IOException {
+        String withoutScheme = s3Url.substring(5); // drop the "s3://" prefix
+        int slashIdx = withoutScheme.indexOf('/');
+        if (slashIdx <= 0 || slashIdx == withoutScheme.length() - 1) {
+            throw new IllegalArgumentException("Malformed S3 URL (expected s3://bucket/key): " + redact(s3Url));
+        }
+        String bucket = withoutScheme.substring(0, slashIdx);
+        String key = withoutScheme.substring(slashIdx + 1);
+
+        S3TransferManager tm = getS3TransferManager();
+
+        // same directory as the destination so the final move stays on one filesystem
+        Path partFile = localPath.resolveSibling(localPath.getFileName() + ".part");
+
+        DownloadFileRequest request = DownloadFileRequest.builder()
+                .getObjectRequest(b -> b.bucket(bucket).key(key))
+                .addTransferListener(LoggingTransferListener.create())
+                .destination(partFile)
+                .build();
+
+        boolean downloaded = false;
+        for (int i = 0; i < 3; i++) { // 3 attempts
+            try {
+                // clear leftovers from an earlier attempt or an earlier crashed run
+                Files.deleteIfExists(partFile);
+
+                FileDownload downloadFile = tm.downloadFile(request);
+                CompletedFileDownload result = downloadFile.completionFuture().join();
+                long downloadedSize = Files.size(partFile);
+                Long expectedSize = result.response().contentLength();
+
+                // Null check prevents NullPointerException during unboxing.
+                // If expectedSize is null, we trust the transfer manager's successful completion.
+                if (expectedSize != null && downloadedSize != expectedSize) {
+                    logger.error("Incomplete download (got {} of {} bytes). Retrying...", downloadedSize, expectedSize);
+                    Files.deleteIfExists(partFile);
+                    continue;
+                }
+
+                Files.move(partFile, localPath,
+                        StandardCopyOption.ATOMIC_MOVE,
+                        StandardCopyOption.REPLACE_EXISTING);
+                downloaded = true;
+                break;
+            } catch (Exception e) {
+                logger.error("Download attempt {} failed for {}: {}", i + 1, redact(key), redact(e.getMessage()));
+                Files.deleteIfExists(partFile);
+            }
+        }
+        if (!downloaded) {
+            throw new IOException("Failed to download " + redact(s3Url) + " after 3 attempts");
+        }
+    }
+
+    /// Builds the CRT-based async S3 client used for dataset downloads: region
+    /// {@code US_EAST_1}, anonymous credentials, a 10 Gbps target throughput and an
+    /// 8 MiB minimum part size.
+    private static S3AsyncClient s3AsyncClient() {
+        var identified = S3AsyncClient.crtBuilder()
+                .region(Region.US_EAST_1)
+                .credentialsProvider(AnonymousCredentialsProvider.create());
+        var tuned = identified
+                .targetThroughputInGbps(10.0)
+                .minimumPartSizeInBytes(8L * 1024 * 1024);
+        return tuned.build();
+    }
+
+    // ========================================================================================
+    // HTTP CLIENT IMPLEMENTATION
+    // ========================================================================================
+
+    /// Downloads {@code url} with a GET request into a temporary file in the destination's
+    /// directory and moves it onto {@code localPath} once the server has answered with
+    /// status 200. On any failure the temporary file is removed.
+    ///
+    /// @throws IOException on a non-200 status, an I/O failure, or if the calling thread is
+    ///                     interrupted (the interrupt flag is restored first)
+    private void downloadFileHttp(String url, Path localPath) throws IOException {
+        HttpRequest get = HttpRequest.newBuilder()
+                .uri(URI.create(url))
+                .GET()
+                .build();
+
+        Path destinationDir = localPath.toAbsolutePath().getParent();
+        if (destinationDir != null) {
+            Files.createDirectories(destinationDir);
+        }
+        Path scratch = Files.createTempFile(destinationDir, "download-", ".tmp");
+
+        try {
+            int status = httpClient.send(get, HttpResponse.BodyHandlers.ofFile(scratch)).statusCode();
+            if (status != 200) {
+                throw new IOException("HTTP " + status + " downloading " + redact(url));
+            }
+            Files.move(scratch, localPath, StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
+        } catch (InterruptedException interrupted) {
+            // restore the flag so callers further up can still observe the interruption
+            Thread.currentThread().interrupt();
+            Files.deleteIfExists(scratch);
+            throw new IOException("Interrupted downloading " + redact(url), interrupted);
+        } catch (Exception failure) {
+            Files.deleteIfExists(scratch);
+            throw failure;
+        }
+    }
+}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java
index 93ace9249..3207e492f 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java
@@ -21,6 +21,9 @@
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Optional;
@@ -28,7 +31,7 @@
 /// Reads dataset metadata from a multi-entry YAML file and provides keyed lookups
 /// for {@link DataSetProperties}.
 ///
-/// This is used by loaders such as {@link DataSetLoaderMFD} and {@link DataSetLoaderHDF5}
+/// This is used by loaders such as {@link DataSetLoaderSimpleMFD}
 /// that do not have an intrinsic way to determine the similarity function from the dataset
 /// name or file format alone.
 ///
@@ -48,7 +51,8 @@
 /// the exact key first, then falls back to the key with {@code .hdf5} appended.
 public class DataSetMetadataReader {
 
-    private static final String DEFAULT_FILE = "jvector-examples/yaml-configs/dataset_metadata.yml";
+    private static final String DEFAULT_FILE = "jvector-examples/yaml-configs/dataset-metadata.yml";
+    private static final String MODULE_RELATIVE_DEFAULT_FILE = "yaml-configs/dataset-metadata.yml";
 
     private final Map<String, Map<String, Object>> metadata;
 
@@ -56,12 +60,26 @@ private DataSetMetadataReader(Map<String, Map<String, Object>> metadata) {
         this.metadata = metadata != null ? metadata : Map.of();
     }
 
-    /// Loads dataset metadata from the default file ({@code jvector-examples/yaml-configs/dataset_metadata.yml}).
+    /// Loads dataset metadata from {@code jvector-examples/yaml-configs/dataset-metadata.yml}, else {@code yaml-configs/dataset-metadata.yml}.
     ///
     /// @return the loaded metadata
     /// @throws RuntimeException if the file cannot be read
     public static DataSetMetadataReader load() {
-        return load(DEFAULT_FILE);
+        // Candidate locations in priority order; both are relative paths, so they are
+        // resolved against the current working directory.
+        Path[] candidates = {Paths.get(DEFAULT_FILE), Paths.get(MODULE_RELATIVE_DEFAULT_FILE)};
+        for (Path candidate : candidates) {
+            if (Files.isRegularFile(candidate)) {
+                return load(candidate.toString());
+            }
+        }
+
+        // Neither candidate is a regular file: report both fully resolved paths.
+        Path firstLocation = candidates[0].toAbsolutePath().normalize();
+        Path secondLocation = candidates[1].toAbsolutePath().normalize();
+        String message = "Failed to load dataset metadata from default locations: "
+                + firstLocation + " or " + secondLocation;
+        throw new RuntimeException(message);
     }
 
     /// Loads dataset metadata from the specified file.
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java
index 5ae1cf2e6..6d017d899 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java
@@ -150,7 +150,7 @@ default boolean isValid() {
     /// ));
     ///
     /// // From a YAML file, selecting a named entry
-    /// var props = new DataSetProperties.PropertyMap("dataset_metadata.yml", "ada002-100k");
+    /// var props = new DataSetProperties.PropertyMap("dataset-metadata.yml", "ada002-100k");
     ///
     /// // From a flat YAML file (no top-level key)
     /// var props = new DataSetProperties.PropertyMap("my_dataset.yml", null);
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java
index 449ff4fc6..94b2e1c77 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java
@@ -37,8 +37,15 @@ public class DataSets {
     private static final Logger logger = LoggerFactory.getLogger(DataSets.class);
 
     public static final List<DataSetLoader> defaultLoaders = new ArrayList<>() {{
-        add(new DataSetLoaderHDF5());
-        add(new DataSetLoaderMFD());
+
+        // Scans the jvector-examples/yaml-configs/dataset-catalogs/ directory for .yaml/.yml files.
+        // The path is relative, so it is resolved against the JVM's working directory.
+        //
+        // To add your own datasets:
+        // 1. Add a .yaml file with your dataset mappings (see local-catalog.yaml for examples)
+        // 2. For private remote datasets, use baseurl with ${SECRET_HASH} style env vars
+        add(new DataSetLoaderSimpleMFD("jvector-examples/yaml-configs/dataset-catalogs"));
+
     }};
 
     /// Loads a dataset by name using the {@link #defaultLoaders}.
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java
index 77e180f4f..5cb2ebcde 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java
@@ -17,7 +17,6 @@
 package io.github.jbellis.jvector.example.reporting;
 
 import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
-import io.github.jbellis.jvector.example.benchmarks.datasets.DataSetLoaderMFD;
 import io.github.jbellis.jvector.example.benchmarks.Metric;
 import io.github.jbellis.jvector.example.yaml.MultiConfig;
 import io.github.jbellis.jvector.example.yaml.MetricSelection;
@@ -39,7 +38,7 @@
  * - sys_info.json (RunReporting)
  * - dataset_info.csv (DatasetInfoWriter)
  * - experiments.csv (ExperimentsCsvWriter)
- * - run-level compute/display/log selections from run.yml
+ * - run-level compute/display/log selections from run-config.yml
  */
 public final class RunArtifacts {
 
@@ -242,17 +241,6 @@ public void registerDataset(String datasetName, DataSet ds) throws IOException {
             return; // disabled
         }
 
-        var mfd = DataSetLoaderMFD.MultiFileDatasource.byName.get(datasetName);
-
-        String basePath = "";
-        String queryPath = "";
-        String gtPath = "";
-        if (mfd != null) {
-            basePath = Paths.get("fvec").resolve(mfd.basePath).toAbsolutePath().toString();
-            queryPath = Paths.get("fvec").resolve(mfd.queriesPath).toAbsolutePath().toString();
-            gtPath = Paths.get("fvec").resolve(mfd.groundTruthPath).toAbsolutePath().toString();
-        }
-
-        datasetInfoWriter.register(DatasetInfoWriter.fromDataSet(datasetName, basePath, queryPath, gtPath, ds));
+        datasetInfoWriter.register(DatasetInfoWriter.fromDataSet(datasetName, "", "", "", ds));
     }
 }
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunReporting.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunReporting.java
index 5d6dca98c..9cb21e24a 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunReporting.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunReporting.java
@@ -29,7 +29,7 @@
 /**
  * Bootstraps a benchmark run directory and writes sys_info.json.
  *
- * This class creates a run_id/run_uuid, selects the logging directory from run.yml, captures basic
+ * This class creates a run_id/run_uuid, selects the logging directory from run-config.yml, captures basic
  * environment metadata (OS/JVM/CPU/SIMD/threads/memory), computes a stable system_id, and returns a
  * {@link RunContext} for downstream writers (dataset_info.csv, experiments.csv).
  */
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/SearchSelection.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/SearchSelection.java
index fbe3e96c4..814c862b2 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/SearchSelection.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/SearchSelection.java
@@ -26,7 +26,7 @@
  * Encapsulates selection + resolution + warning + application for a single sink (console/logging)
  * in the search phase, using {@link ReportingSelectionResolver} and {@link SearchReportingCatalog}.
  *
- * Selections are typically run-level (from run.yml via {@link io.github.jbellis.jvector.example.yaml.RunConfig}).
+ * Selections are typically run-level (from run-config.yml via {@link io.github.jbellis.jvector.example.yaml.RunConfig}).
  *
  * This prevents call-site ordering mistakes (validate -> resolve -> warn -> apply).
  */
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java
index 0f2b79805..6e56a6e26 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java
@@ -37,7 +37,7 @@ public class MultiConfig {
     public SearchParameters search;
     public String dataset;
 
-    private static final String defaultDirectory = "jvector-examples/yaml-configs/";
+    private static final String defaultDirectory = "jvector-examples/yaml-configs/index-parameters/";
     private static final java.util.regex.Pattern YAML_SCHEMA_VERSION_KEY =
             java.util.regex.Pattern.compile("(?m)^\\s*yamlSchemaVersion\\s*:");
 
@@ -127,7 +127,7 @@ static MultiConfig getConfig(File configFile) throws FileNotFoundException {
         // Legacy yamlSchemaVersion "0": lenient parsing (ignore unknown fields like search.benchmarks)
         if (WARNED_LEGACY.compareAndSet(false, true)) {
             System.err.println("WARNING: Deprecated legacy YAML schema detected (no yamlSchemaVersion). "
-                    + "Unknown fields will be ignored. Please migrate configs to yamlSchemaVersion: 1 and run.yml.");
+                    + "Unknown fields will be ignored. Please migrate configs to yamlSchemaVersion: 1 and run-config.yml.");
         }
 
         try {
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/RunConfig.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/RunConfig.java
index b65dda082..8711007aa 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/RunConfig.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/RunConfig.java
@@ -26,7 +26,7 @@
 import java.util.Map;
 
 /**
- * Run-level configuration loaded from yaml-configs/run.yml.
+ * Run-level configuration loaded from yaml-configs/run-config.yml.
  *
  * This controls:
  * - benchmark computation (benchmarks)
@@ -36,7 +36,7 @@
  */
 public class RunConfig {
     private static final String defaultDirectory = "jvector-examples/yaml-configs/";
-    private static final String defaultRunFile = "run.yml";
+    private static final String defaultRunFile = "run-config.yml";
 
     public int yamlSchemaVersion;
     public int onDiskIndexVersion;
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java
index e636a3ce9..dabce268a 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java
@@ -23,6 +23,6 @@ public class SearchParameters extends CommonParameters {
     public Map<Integer, List<Double>> topKOverquery;
     public List<Boolean> useSearchPruning;
 
-    // NOTE: benchmark compute + console/logging selection are now run-level (run.yml)
+    // NOTE: benchmark compute + console/logging selection are now run-level (run-config.yml)
     // and are no longer recognized in dataset configs.
 }
diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFDTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFDTest.java
new file mode 100644
index 000000000..33379dd57
--- /dev/null
+++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFDTest.java
@@ -0,0 +1,1514 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.jbellis.jvector.example.benchmarks.datasets;
+
+import com.sun.net.httpserver.HttpExchange;
+import com.sun.net.httpserver.HttpServer;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.InetSocketAddress;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/// Tests for {@link DataSetLoaderSimpleMFD} using temporary local files and, for the remote-catalog cases, an in-process HTTP server.
+public class DataSetLoaderSimpleMFDTest {
+
+    @Rule
+    public TemporaryFolder tempFolder = new TemporaryFolder();
+
+    private Path cacheDir;
+    private DataSetMetadataReader testMetadata;
+
+    /// Returns the name of the first of HOME, USERPROFILE, PATH that is set in this process's
+    /// environment, so tests that need an existing variable have one on Unix-like systems and Windows.
+    private static String findReliableEnvVar() {
+        if (System.getenv("HOME") != null) return "HOME";
+        if (System.getenv("USERPROFILE") != null) return "USERPROFILE";
+        if (System.getenv("PATH") != null) return "PATH";
+        throw new AssertionError("Could not find any set environment variable for testing");
+    }
+
+    /// Prepares a fresh cache directory and test-only dataset metadata before each test.
+    @Before
+    public void setUp() throws IOException {
+        String metadataYaml =
+                "test-ds:\n" +
+                "  similarity_function: COSINE\n" +
+                "  load_behavior: NO_SCRUB\n" +
+                "sub-ds:\n" +
+                "  similarity_function: COSINE\n" +
+                "  load_behavior: NO_SCRUB\n" +
+                "private-ds:\n" +
+                "  similarity_function: DOT_PRODUCT\n" +
+                "  load_behavior: NO_SCRUB\n";
+        cacheDir = tempFolder.newFolder("datasets").toPath();
+        Path metadataPath = tempFolder.newFile("test_metadata.yml").toPath();
+        Files.writeString(metadataPath, metadataYaml);
+        testMetadata = DataSetMetadataReader.load(metadataPath.toString());
+    }
+
+    // ========================================================================
+    // Basic loading
+    // ========================================================================
+
+    /// A catalog and its data files in the same directory load without any catalog URL.
+    @Test
+    public void loadsDatasetFromLocalCatalogAndFiles() throws IOException {
+        writeTestCatalog(cacheDir);
+        writeTestDataFiles(cacheDir);
+        String localPath = cacheDir.toString();
+        var localOnlyLoader = new DataSetLoaderSimpleMFD(null, localPath, false, testMetadata);
+
+        var lookup = localOnlyLoader.loadDataSet("test-ds");
+        assertTrue(lookup.isPresent(), "Dataset should be found in local catalog");
+        var entry = lookup.get();
+        assertEquals("test-ds", entry.getName());
+        // expected sizes presumably mirror what writeTestDataFiles writes — helper not shown here
+        var loaded = entry.getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+        assertEquals(2, loaded.getQueryVectors().size());
+        assertEquals(2, loaded.getGroundTruth().size());
+        assertEquals(4, loaded.getDimension());
+    }
+
+    /// An unknown dataset name yields an empty result rather than an exception.
+    @Test
+    public void returnsEmptyForUnknownDataset() throws IOException {
+        writeTestCatalog(cacheDir);
+        String localPath = cacheDir.toString();
+        var localOnlyLoader = new DataSetLoaderSimpleMFD(null, localPath, false, testMetadata);
+
+        var lookup = localOnlyLoader.loadDataSet("nonexistent-dataset");
+        assertFalse(lookup.isPresent());
+    }
+
+    /// With a catalog entry but no data files and no catalog URL, loading must fail with a clear cause.
+    @Test
+    public void failsWhenLocalFilesMissing() throws IOException {
+        writeTestCatalog(cacheDir);
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+        var ex = assertThrows(RuntimeException.class, () -> loader.loadDataSet("test-ds"));
+        // fail with a readable assertion, not a NullPointerException, if the exception has no cause
+        assertNotNull(ex.getCause(), "Expected a wrapped cause but got: " + ex);
+        String causeMessage = String.valueOf(ex.getCause().getMessage());
+        assertTrue(causeMessage.contains("no remote URL configured"),
+                "Error should indicate no remote is available: " + causeMessage);
+    }
+
+    /// No local catalog plus a catalog URL that cannot be fetched: the constructor itself must throw.
+    @Test
+    public void failsWhenNoCatalogAndRemoteUnreachable() {
+        String unreachableCatalogUrl = "http://0.0.0.0:1/catalog_entries.yaml";
+        assertThrows(RuntimeException.class, () ->
+                new DataSetLoaderSimpleMFD(unreachableCatalogUrl, cacheDir.toString(), false, testMetadata));
+    }
+
+    /// An unreachable catalog URL must not break loading when a local catalog already exists.
+    @Test
+    public void checkForUpdatesDoesNotFailWhenRemoteUnreachable() throws IOException {
+        writeTestCatalog(cacheDir);
+        writeTestDataFiles(cacheDir);
+        String unreachableCatalogUrl = "http://0.0.0.0:1/catalog_entries.yaml";
+
+        // construction should not throw; the loader is expected to proceed with the local catalog
+        var loader = new DataSetLoaderSimpleMFD(unreachableCatalogUrl, cacheDir.toString(), true, testMetadata);
+
+        var lookup = loader.loadDataSet("test-ds");
+        assertTrue(lookup.isPresent());
+    }
+
+    /// An entry that lists base and query but omits gt is reported as not present.
+    @Test
+    public void rejectsCatalogWithMissingFields() throws IOException {
+        Path catalogFile = cacheDir.resolve("catalog_entries.yaml");
+        String incompleteEntry = "bad-ds:\n  base: b.fvecs\n  query: q.fvecs\n";
+        Files.writeString(catalogFile, incompleteEntry);
+
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var lookup = loader.loadDataSet("bad-ds");
+        assertFalse(lookup.isPresent(), "Should return empty for dataset with missing catalog fields");
+    }
+
+    /// An unrecognized key inside a dataset entry is rejected when the loader is constructed.
+    @Test
+    public void unknownFieldThrows() throws IOException {
+        String catalogYaml =
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n" +
+                "  similarity: COSINE\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+        var thrown = assertThrows(IllegalArgumentException.class,
+                () -> new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata));
+        String message = thrown.getMessage();
+        assertTrue(message.contains("similarity"), "Error should name the unknown field: " + message);
+    }
+
+    /// An unrecognized key under _defaults is rejected when the loader is constructed.
+    @Test
+    public void unknownFieldInDefaultsThrows() throws IOException {
+        String catalogYaml =
+                "_defaults:\n" +
+                "  typo_field: some_value\n" +
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+        var thrown = assertThrows(IllegalArgumentException.class,
+                () -> new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata));
+        String message = thrown.getMessage();
+        assertTrue(message.contains("typo_field"), "Error should name the unknown field: " + message);
+    }
+
+    // ========================================================================
+    // Local path resolution (file vs directory)
+    // ========================================================================
+
+    /// The local path may point directly at a catalog YAML file instead of a directory.
+    @Test
+    public void loadsWithLocalPathAsYamlFile() throws IOException {
+        writeTestCatalog(cacheDir);
+        writeTestDataFiles(cacheDir);
+
+        // writeTestCatalog is assumed to create catalog_entries.yaml in cacheDir — helper not shown here
+        Path catalogFile = cacheDir.resolve("catalog_entries.yaml");
+        var loader = new DataSetLoaderSimpleMFD(null, catalogFile.toString(), false, testMetadata);
+
+        var entry = loader.loadDataSet("test-ds").orElseThrow();
+        assertEquals(5, entry.getDataSet().getBaseVectors().size());
+    }
+
+    /// The single-argument constructor accepts a path to a catalog file.
+    @Test
+    public void singleArgConstructorWithFilePath() throws IOException {
+        writeTestCatalog(cacheDir);
+        Path catalogFile = cacheDir.resolve("catalog_entries.yaml");
+        var loader = new DataSetLoaderSimpleMFD(catalogFile.toString());
+
+        // can't call getDataSet() (no test-ds in production metadata), so only probe an unknown name
+        var lookup = loader.loadDataSet("nonexistent");
+        assertFalse(lookup.isPresent());
+    }
+
+    /// The single-argument constructor accepts a directory path.
+    @Test
+    public void singleArgConstructorWithDirectory() throws IOException {
+        writeTestCatalog(cacheDir);
+        var lookup = new DataSetLoaderSimpleMFD(cacheDir.toString()).loadDataSet("nonexistent");
+        assertFalse(lookup.isPresent());
+    }
+
+    @Test
+    public void singleArgConstructorMissingCatalogReturnsEmpty() {
+        // nothing is written to cacheDir here, so the lookup should come back empty
+        assertFalse(new DataSetLoaderSimpleMFD(cacheDir.toString()).loadDataSet("test-ds").isPresent());
+    }
+
+    // ========================================================================
+    // Null / empty catalog URL (local-only mode)
+    // ========================================================================
+
+    /// A null catalog URL is accepted: the loader works from the local catalog alone.
+    @Test
+    public void nullCatalogUrlWorksWithLocalCatalog() throws IOException {
+        writeTestCatalog(cacheDir);
+        writeTestDataFiles(cacheDir);
+
+        // first argument null: no catalog URL
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var entry = loader.loadDataSet("test-ds").orElseThrow();
+        assertEquals(5, entry.getDataSet().getBaseVectors().size());
+    }
+
+    /// An empty-string catalog URL is accepted: the loader works from the local catalog alone.
+    @Test
+    public void emptyCatalogUrlWorksWithLocalCatalog() throws IOException {
+        writeTestCatalog(cacheDir);
+        writeTestDataFiles(cacheDir);
+
+        String noCatalogUrl = "";
+        var loader = new DataSetLoaderSimpleMFD(noCatalogUrl, cacheDir.toString(), false, testMetadata);
+
+        var entry = loader.loadDataSet("test-ds").orElseThrow();
+        assertEquals(5, entry.getDataSet().getBaseVectors().size());
+    }
+
+    /// With neither a catalog URL nor a local catalog file, lookups come back empty.
+    @Test
+    public void nullCatalogUrlWithoutLocalCatalogReturnsEmpty() {
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+        var lookup = loader.loadDataSet("test-ds");
+        assertFalse(lookup.isPresent());
+    }
+
+    /// Passing true as the third constructor argument is harmless when the catalog URL is null.
+    @Test
+    public void nullCatalogUrlIgnoresCheckForUpdates() throws IOException {
+        writeTestCatalog(cacheDir);
+        writeTestDataFiles(cacheDir);
+
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), true, testMetadata);
+        var lookup = loader.loadDataSet("test-ds");
+        assertTrue(lookup.isPresent());
+    }
+
+    // ========================================================================
+    // catalogUrl remote catalog loading
+    // ========================================================================
+
+    /// With no local catalog, the loader fetches the catalog from the URL, stores a copy under the
+    /// local path, and places the dataset files in the cache_dir named by the remote catalog.
+    @Test
+    public void catalogUrlFetchesRemoteCatalogWhenNoLocalCatalogExists() throws IOException {
+        Path remoteDir = tempFolder.newFolder("remote-catalog").toPath();
+        Path remoteCacheDir = tempFolder.newFolder("remote-cache").toPath();
+        Path localCatalog = cacheDir.resolve("catalog_entries.yaml");
+
+        String remoteCatalogYaml =
+                "_defaults:\n" +
+                        "  cache_dir: " + remoteCacheDir + "\n" +
+                        "test-ds:\n" +
+                        "  base: test_base.fvecs\n" +
+                        "  query: test_query.fvecs\n" +
+                        "  gt: test_gt.ivecs\n";
+        Files.writeString(remoteDir.resolve("catalog_entries.yaml"), remoteCatalogYaml);
+        writeTestDataFiles(remoteDir);
+        assertFalse(Files.exists(localCatalog), "Precondition: local catalog should not exist");
+
+        HttpServer server = startFileServer(remoteDir);
+        try {
+            var catalogUrl = urlFor(server, "catalog_entries.yaml");
+            var loader = new DataSetLoaderSimpleMFD(catalogUrl, cacheDir.toString(), false, testMetadata);
+
+            // remote catalog should be cached locally
+            assertTrue(Files.exists(localCatalog));
+
+            var loaded = loader.loadDataSet("test-ds").orElseThrow().getDataSet();
+            assertEquals(5, loaded.getBaseVectors().size());
+            assertEquals(2, loaded.getQueryVectors().size());
+            assertEquals(2, loaded.getGroundTruth().size());
+            assertEquals(4, loaded.getDimension());
+
+            // dataset files should land in the cache_dir declared by the remote catalog
+            for (String fileName : new String[] {"test_base.fvecs", "test_query.fvecs", "test_gt.ivecs"}) {
+                assertTrue(Files.exists(remoteCacheDir.resolve(fileName)));
+            }
+        } finally {
+            server.stop(0);
+        }
+    }
+
+    /// When a local catalog already exists, entries that appear only in the remote catalog
+    /// must not become visible through the loader.
+    @Test
+    public void catalogUrlDoesNotMergeRemoteCatalogWhenLocalCatalogExists() throws IOException {
+        // local side: the test catalog and its data files
+        writeTestCatalog(cacheDir);
+        writeTestDataFiles(cacheDir);
+
+        // remote side: a catalog that defines only sub-ds
+        Path remoteDir = tempFolder.newFolder("remote-catalog").toPath();
+        String remoteCatalogYaml =
+                "sub-ds:\n" +
+                        "  base: test_base.fvecs\n" +
+                        "  query: test_query.fvecs\n" +
+                        "  gt: test_gt.ivecs\n";
+        Files.writeString(remoteDir.resolve("catalog_entries.yaml"), remoteCatalogYaml);
+        writeTestDataFiles(remoteDir);
+
+        HttpServer server = startFileServer(remoteDir);
+        try {
+            var catalogUrl = urlFor(server, "catalog_entries.yaml");
+            var loader = new DataSetLoaderSimpleMFD(catalogUrl, cacheDir.toString(), false, testMetadata);
+            assertTrue(loader.loadDataSet("test-ds").isPresent(), "Local catalog entry should be found");
+            assertFalse(loader.loadDataSet("sub-ds").isPresent(),
+                    "Remote catalog should not be merged when a local catalog exists");
+        } finally {
+            server.stop(0);
+        }
+    }
+
+    // ========================================================================
+    // Comment-only / empty catalog files
+    // ========================================================================
+
+    /// A catalog file containing only YAML comments is accepted and yields no datasets.
+    @Test
+    public void commentOnlyCatalogFileReturnsEmpty() throws IOException {
+        String commentsOnly = "# This file has no actual entries\n# Just comments\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), commentsOnly);
+
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+        var lookup = loader.loadDataSet("test-ds");
+        assertFalse(lookup.isPresent());
+    }
+
+    /// A zero-length catalog file is accepted and yields no datasets.
+    @Test
+    public void emptyCatalogFileReturnsEmpty() throws IOException {
+        Path catalogFile = cacheDir.resolve("catalog_entries.yaml");
+        Files.writeString(catalogFile, "");
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+        var lookup = loader.loadDataSet("test-ds");
+        assertFalse(lookup.isPresent());
+    }
+
+    /// The single-argument constructor tolerates a catalog file that holds only a comment.
+    @Test
+    public void singleArgWithCommentOnlyCatalogReturnsEmpty() throws IOException {
+        Path catalogFile = cacheDir.resolve("catalog_entries.yaml");
+        Files.writeString(catalogFile, "# placeholder for local datasets\n");
+
+        var loader = new DataSetLoaderSimpleMFD(catalogFile.toString());
+        var lookup = loader.loadDataSet("anything");
+        assertFalse(lookup.isPresent());
+    }
+
+    // ========================================================================
+    // Recursive catalog discovery
+    // ========================================================================
+
+    /// Catalog files in subdirectories of the local path are discovered alongside the root catalog.
+    @Test
+    public void recursivelyDiscoversCatalogs() throws IOException {
+        // root level: the test catalog and its data files
+        writeTestCatalog(cacheDir);
+        writeTestDataFiles(cacheDir);
+
+        // one level down: a second catalog that defines a different dataset
+        Path nestedDir = Files.createDirectories(cacheDir.resolve("subgroup"));
+        String nestedCatalogYaml =
+                "sub-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(nestedDir.resolve("catalog_entries.yaml"), nestedCatalogYaml);
+        writeTestDataFiles(nestedDir);
+
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var rootLookup = loader.loadDataSet("test-ds");
+        assertTrue(rootLookup.isPresent(), "Root catalog entry should be found");
+        var nestedLookup = loader.loadDataSet("sub-ds");
+        assertTrue(nestedLookup.isPresent(), "Subdirectory catalog entry should be found");
+    }
+
+    /// Relative file names in a nested catalog resolve against that catalog's own directory.
+    @Test
+    public void subdirectoryDataFilesResolveRelativeToTheirCatalog() throws IOException {
+        // the root directory gets no files; catalog and data files both live in the subdirectory
+        Path nestedDir = Files.createDirectories(cacheDir.resolve("subgroup"));
+        String nestedCatalogYaml =
+                "sub-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(nestedDir.resolve("catalog_entries.yaml"), nestedCatalogYaml);
+        writeTestDataFiles(nestedDir);
+
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        // data files should resolve relative to nestedDir, not cacheDir
+        var entry = loader.loadDataSet("sub-ds").orElseThrow();
+        var loaded = entry.getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    /// The same dataset name defined in two catalogs is tolerated; either definition may win.
+    @Test
+    public void duplicateEntryAcrossCatalogsDoesNotFail() throws IOException {
+        // root catalog defines test-ds
+        writeTestCatalog(cacheDir);
+        writeTestDataFiles(cacheDir);
+
+        // a subdirectory catalog defines test-ds again — one wins (walk order is unspecified)
+        Path overrideDir = Files.createDirectories(cacheDir.resolve("override"));
+        String duplicateCatalogYaml =
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(overrideDir.resolve("catalog_entries.yaml"), duplicateCatalogYaml);
+        writeTestDataFiles(overrideDir);
+
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        // should load without error — whichever catalog wins, the dataset is valid
+        var entry = loader.loadDataSet("test-ds").orElseThrow();
+        var loaded = entry.getDataSet();
+        assertNotNull(loaded);
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    /// Catalog discovery is not tied to the name catalog_entries.yaml: a differently named
+    /// .yaml file in the root and another in a subdirectory are both picked up.
+    @Test
+    public void discoversAlternativeCatalogFilenames() throws IOException {
+        // both catalogs map their dataset to the same three data file names
+        String fileMappings =
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+
+        // entries.yaml in root
+        Files.writeString(cacheDir.resolve("entries.yaml"), "test-ds:\n" + fileMappings);
+        writeTestDataFiles(cacheDir);
+
+        // private_entries.yaml in subdirectory
+        Path privateDir = Files.createDirectories(cacheDir.resolve("private"));
+        Files.writeString(privateDir.resolve("private_entries.yaml"), "sub-ds:\n" + fileMappings);
+        writeTestDataFiles(privateDir);
+
+        var loader = new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        // each dataset name is defined in exactly one of the two files written above
+        var rootLookup = loader.loadDataSet("test-ds");
+        assertTrue(rootLookup.isPresent(), "root yaml should be discovered");
+        var nestedLookup = loader.loadDataSet("sub-ds");
+        assertTrue(nestedLookup.isPresent(), "subdirectory yaml should be discovered");
+    }
+
+    @Test
+    public void ignoresNonYamlFiles() throws IOException {
+        // .json and .txt files should not be picked up
+        Files.writeString(cacheDir.resolve("datasets.json"),
+                "{\"test-ds\": {\"base\": \"test_base.fvecs\"}}");
+        Files.writeString(cacheDir.resolve("readme.txt"),
+                "test-ds:\n  base: test_base.fvecs\n  query: test_query.fvecs\n  gt: test_gt.ivecs\n");
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        assertFalse(loader.loadDataSet("test-ds").isPresent(),
+                "Non-YAML files should be ignored");
+    }
+
+    @Test
+    public void anyYamlFileIsDiscovered() throws IOException {
+        // any .yaml file should be picked up, not just *entries.yaml
+        Files.writeString(cacheDir.resolve("my_datasets.yaml"),
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(cacheDir);
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        assertTrue(loader.loadDataSet("test-ds").isPresent(),
+                "Any .yaml file should be discovered");
+    }
+
+    @Test
+    public void ymlExtensionAlsoDiscovered() throws IOException {
+        Files.writeString(cacheDir.resolve("datasets.yml"),
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(cacheDir);
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        assertTrue(loader.loadDataSet("test-ds").isPresent(),
+                ".yml files should also be discovered");
+    }
+
+    // ========================================================================
+    // Per-entry base_url override
+    // ========================================================================
+
+    @Test
+    public void base_urlOverrideIsUsedForDownload() throws IOException {
+        // The entry carries a base_url that points at an unreachable server. Because the data
+        // files are already present next to the catalog, no download is needed; the test
+        // checks that such an entry parses and still resolves to the local files.
+        String catalogYaml =
+                "private-ds:\n" +
+                "  base_url: http://0.0.0.0:1/secret-hash/\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Path privateDir = cacheDir.resolve("private");
+        Files.createDirectories(privateDir);
+        Files.writeString(privateDir.resolve("catalog_entries.yaml"), catalogYaml);
+        writeTestDataFiles(privateDir);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("private-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void base_urlOverrideFailsWhenLocalFilesMissingAndRemoteUnreachable() throws IOException {
+        // Nothing exists locally and the entry's base_url cannot be reached, so loading the
+        // dataset has no way to succeed.
+        String catalogYaml =
+                "private-ds:\n" +
+                "  base_url: http://0.0.0.0:1/secret-hash/\n" +
+                "  base: missing_base.fvecs\n" +
+                "  query: missing_query.fvecs\n" +
+                "  gt: missing_gt.ivecs\n";
+        Path privateDir = cacheDir.resolve("private");
+        Files.createDirectories(privateDir);
+        Files.writeString(privateDir.resolve("catalog_entries.yaml"), catalogYaml);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        // expected outcome: a RuntimeException from the load attempt
+        assertThrows(RuntimeException.class, () -> mfdLoader.loadDataSet("private-ds"));
+    }
+
+    @Test
+    public void base_urlWithoutTrailingSlashIsNormalized() throws IOException {
+        // A base_url written without the final "/" must be accepted; the entry should load
+        // just like one whose base_url ends in a slash.
+        String catalogYaml =
+                "private-ds:\n" +
+                "  base_url: http://0.0.0.0:1/no-trailing-slash\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Path privateDir = cacheDir.resolve("private");
+        Files.createDirectories(privateDir);
+        Files.writeString(privateDir.resolve("catalog_entries.yaml"), catalogYaml);
+        writeTestDataFiles(privateDir);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("private-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void subdirectoryPathsInFileValuesResolveCorrectly() throws IOException {
+        // File values may include a relative directory component (the same shape as entries
+        // such as "dpr/c4-en_base_1M_norm.fvecs"); they are resolved below the catalog's folder.
+        String catalogYaml =
+                "test-ds:\n" +
+                "  base_url: http://0.0.0.0:1/secret-hash/\n" +
+                "  base: subdir/test_base.fvecs\n" +
+                "  query: subdir/test_query.fvecs\n" +
+                "  gt: subdir/test_gt.ivecs\n";
+        Path catalogDir = cacheDir.resolve("jvector_private");
+        Files.createDirectories(catalogDir);
+        Files.writeString(catalogDir.resolve("large_dataset_entries.yaml"), catalogYaml);
+
+        // the data lives one level below the catalog, matching the "subdir/" prefix
+        Path nestedData = catalogDir.resolve("subdir");
+        Files.createDirectories(nestedData);
+        writeTestDataFiles(nestedData);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+        assertEquals(4, loaded.getDimension());
+    }
+
+    @Test
+    public void subdirectoryPathsDownloadWhenLocalMissing() throws IOException {
+        // Same layout as the subdirectory case, but none of the referenced files exist and
+        // the entry's base_url is unreachable, so the load attempt must end in an exception.
+        String catalogYaml =
+                "test-ds:\n" +
+                "  base_url: http://0.0.0.0:1/secret-hash/\n" +
+                "  base: subdir/missing_base.fvecs\n" +
+                "  query: subdir/missing_query.fvecs\n" +
+                "  gt: subdir/missing_gt.ivecs\n";
+        Path catalogDir = cacheDir.resolve("jvector_private");
+        Files.createDirectories(catalogDir);
+        Files.writeString(catalogDir.resolve("large_dataset_entries.yaml"), catalogYaml);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        assertThrows(RuntimeException.class, () -> mfdLoader.loadDataSet("test-ds"));
+    }
+
+    // ========================================================================
+    // _defaults and _-prefix exclusion
+    // ========================================================================
+
+    @Test
+    public void defaultsAreFoldedIntoEntries() throws IOException {
+        // A catalog with a _defaults section supplying base_url must still yield a loadable
+        // entry; the data files are local, so the unreachable URL is never contacted.
+        String catalogYaml =
+                "_defaults:\n" +
+                "  base_url: http://0.0.0.0:1/default-path/\n" +
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+        writeTestDataFiles(cacheDir);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void entryOverridesDefaults() throws IOException {
+        // base_url appears both in _defaults and on the entry itself; a catalog shaped like
+        // this must load cleanly from the local files.
+        String catalogYaml =
+                "_defaults:\n" +
+                "  base_url: http://0.0.0.0:1/should-be-overridden/\n" +
+                "test-ds:\n" +
+                "  base_url: http://0.0.0.0:2/entry-specific/\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+        writeTestDataFiles(cacheDir);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void underscorePrefixedKeysAreExcluded() throws IOException {
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "_defaults:\n" +
+                "  base_url: http://0.0.0.0:1/x/\n" +
+                "_internal:\n" +
+                "  base: should_not_appear.fvecs\n" +
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(cacheDir);
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        assertFalse(loader.loadDataSet("_defaults").isPresent(), "_defaults should not be a dataset");
+        assertFalse(loader.loadDataSet("_internal").isPresent(), "_internal should not be a dataset");
+        assertTrue(loader.loadDataSet("test-ds").isPresent(), "test-ds should be found");
+    }
+
+    // ========================================================================
+    // cache_dir
+    // ========================================================================
+
+    @Test
+    public void cacheDirOverridesLocalDir() throws IOException {
+        // The catalog sits in cacheDir, but the entry's cache_dir names a different folder
+        // that holds the data. cacheDir itself contains no data files.
+        Path customCache = tempFolder.newFolder("custom-cache").toPath();
+        writeTestDataFiles(customCache);
+
+        String catalogYaml =
+                "test-ds:\n" +
+                "  cache_dir: " + customCache + "\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void cacheDirFromDefaults() throws IOException {
+        // cache_dir is given only in _defaults; the entry has none of its own and the data
+        // files exist solely in that default cache folder.
+        Path sharedCache = tempFolder.newFolder("default-cache").toPath();
+        writeTestDataFiles(sharedCache);
+
+        String catalogYaml =
+                "_defaults:\n" +
+                "  cache_dir: " + sharedCache + "\n" +
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void cacheDirEntryOverridesDefault() throws IOException {
+        // Both _defaults and the entry set cache_dir. Only the entry's folder is populated,
+        // so a successful load shows the entry-level value was the one used.
+        Path emptyDefaultCache = tempFolder.newFolder("default-cache").toPath();
+        Path populatedEntryCache = tempFolder.newFolder("entry-cache").toPath();
+        writeTestDataFiles(populatedEntryCache);
+
+        String catalogYaml =
+                "_defaults:\n" +
+                "  cache_dir: " + emptyDefaultCache + "\n" +
+                "test-ds:\n" +
+                "  cache_dir: " + populatedEntryCache + "\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void nonExistentCacheDirIsAutoCreatedOnDownloadAttempt() throws IOException {
+        // cache_dir points to a directory that doesn't exist yet
+        Path newCacheDir = cacheDir.resolve("auto-created-subdir");
+        assertFalse(Files.exists(newCacheDir), "Precondition: dir should not exist yet");
+
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                "  cache_dir: " + newCacheDir + "\n" +
+                "  base_url: http://0.0.0.0:1/unreachable/\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        // download will fail (unreachable), but the directory should have been created
+        assertThrows(RuntimeException.class, () -> loader.loadDataSet("test-ds"));
+        assertTrue(Files.isDirectory(newCacheDir),
+                "cache_dir should be auto-created before download is attempted");
+    }
+
+    @Test
+    public void nonExistentCacheDirWithSubpathIsAutoCreated() throws IOException {
+        // cache_dir doesn't exist, and filenames contain subdirectories
+        Path newCacheDir = cacheDir.resolve("deep/nested/cache");
+        assertFalse(Files.exists(newCacheDir));
+
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                "  cache_dir: " + newCacheDir + "\n" +
+                "  base_url: http://0.0.0.0:1/unreachable/\n" +
+                "  base: subdir/test_base.fvecs\n" +
+                "  query: subdir/test_query.fvecs\n" +
+                "  gt: subdir/test_gt.ivecs\n");
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        assertThrows(RuntimeException.class, () -> loader.loadDataSet("test-ds"));
+        // both cache_dir and the subdir should be created
+        assertTrue(Files.isDirectory(newCacheDir.resolve("subdir")),
+                "cache_dir and subdirectory should be auto-created");
+    }
+
+    @Test
+    public void nonExistentCacheDirWithLocalFilesPrePopulated() throws IOException {
+        // The catalog is written first, then its cache_dir is created and filled with data
+        // (as if an earlier run had downloaded it) before the loader is constructed.
+        Path freshCache = cacheDir.resolve("fresh-cache");
+
+        String catalogYaml =
+                "test-ds:\n" +
+                "  cache_dir: " + freshCache.toString() + "\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+
+        // populate the cache directory ahead of loading
+        Files.createDirectories(freshCache);
+        writeTestDataFiles(freshCache);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    // ========================================================================
+    // ${VAR} expansion
+    // ========================================================================
+
+    @Test
+    public void envVarExpandedInBaseurl() throws IOException {
+        // A ${NAME} placeholder inside base_url, naming a variable obtained from
+        // findReliableEnvVar(), must not prevent the catalog from being parsed and used.
+        String variable = findReliableEnvVar();
+
+        String catalogYaml =
+                "test-ds:\n" +
+                "  base_url: http://0.0.0.0:1/${" + variable + "}/path/\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+        writeTestDataFiles(cacheDir);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        // data is local, so the URL is never contacted; only parsing is exercised
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void envVarExpandedInCacheDir() throws IOException {
+        // Verifies that a ${...} placeholder inside cache_dir is expanded before the directory
+        // is used to locate the data files.
+        Path customCache = tempFolder.newFolder("env-cache").toPath();
+        writeTestDataFiles(customCache);
+
+        // A test cannot portably set environment variables, so reference a variable that is
+        // assumed to be unset and supply the real cache location through the ":-" default form.
+        // The data files exist only in customCache, so the load below can succeed only if the
+        // placeholder was actually expanded (previously this YAML contained no placeholder).
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                "  cache_dir: ${JVECTOR_NONEXISTENT_12345:-" + customCache + "}\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, ds.getBaseVectors().size());
+    }
+
+    @Test
+    public void envVarExpandedInDefaults() throws IOException {
+        // Same as the base_url placeholder case, except the ${NAME} reference sits in the
+        // _defaults section rather than on the entry.
+        String variable = findReliableEnvVar();
+
+        String catalogYaml =
+                "_defaults:\n" +
+                "  base_url: http://0.0.0.0:1/${" + variable + "}/\n" +
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+        writeTestDataFiles(cacheDir);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void undefinedEnvVarThrows() throws IOException {
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                "  base_url: s3://bucket/${JVECTOR_NONEXISTENT_VAR_12345}/\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+
+        var ex = assertThrows(IllegalArgumentException.class, () ->
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata)
+        );
+        assertTrue(ex.getMessage().contains("JVECTOR_NONEXISTENT_VAR_12345"),
+                "Error should name the missing variable: " + ex.getMessage());
+    }
+
+    @Test
+    public void envVarWithDefaultUsesDefault() throws IOException {
+        // ${NAME:-fallback} with an unset NAME must not raise; the catalog loads normally.
+        String catalogYaml =
+                "test-ds:\n" +
+                "  base_url: http://0.0.0.0:1/${JVECTOR_NONEXISTENT_12345:-fallback-path}/\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+        writeTestDataFiles(cacheDir);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void envVarWithDefaultPrefersEnvWhenSet() throws IOException {
+        String envName = findReliableEnvVar();
+
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                "  base_url: http://0.0.0.0:1/${" + envName + ":-not-used}/\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(cacheDir);
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+        assertTrue(loader.loadDataSet("test-ds").isPresent());
+    }
+
+    @Test
+    public void envVarWithEmptyDefault() throws IOException {
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                "  base_url: http://0.0.0.0:1/${JVECTOR_NONEXISTENT_12345:-}/data/\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(cacheDir);
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+        assertTrue(loader.loadDataSet("test-ds").isPresent());
+    }
+
+    @Test
+    public void multipleEnvVarsExpanded() throws IOException {
+        String envName = findReliableEnvVar();
+
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                "  base_url: http://0.0.0.0:1/${" + envName + "}/${" + envName + "}/\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(cacheDir);
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+        assertTrue(loader.loadDataSet("test-ds").isPresent());
+    }
+
+    // ========================================================================
+    // _include directive
+    // ========================================================================
+
+    @Test
+    public void includeWithUnreachableRemoteWarnsButDoesNotFail() throws IOException {
+        // An _include whose url cannot be reached must not abort loading: the entry defined
+        // locally in the same file remains usable.
+        String catalogYaml =
+                "_include:\n" +
+                "  url: http://0.0.0.0:1/nonexistent/catalog_entries.yaml\n" +
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+        writeTestDataFiles(cacheDir);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void includeWithUnreachableRemoteAndNoLocalEntriesReturnsEmpty() throws IOException {
+        // _include only, no local entries, remote unreachable — empty catalog, no crash
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "_include:\n" +
+                "  url: http://0.0.0.0:1/nonexistent/catalog_entries.yaml\n");
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        assertFalse(loader.loadDataSet("anything").isPresent());
+    }
+
+    @Test
+    public void includeWithMissingUrlFieldIsIgnored() throws IOException {
+        // _include exists but has no url field — should be silently ignored
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "_include:\n" +
+                "  description: this has no url\n" +
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(cacheDir);
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, ds.getBaseVectors().size());
+    }
+
+    @Test
+    public void localEntryOverridesIncludedEntry() throws IOException {
+        // Scenario: the remote catalog would also define "test-ds", and the local file
+        // defines it too. The include cannot be fetched here (unreachable url), so only the
+        // local definition is available; the test confirms a failed include does not block it.
+        String catalogYaml =
+                "_include:\n" +
+                "  url: http://0.0.0.0:1/remote/catalog_entries.yaml\n" +
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n";
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"), catalogYaml);
+        writeTestDataFiles(cacheDir);
+
+        DataSetLoaderSimpleMFD mfdLoader =
+                new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata);
+
+        var loaded = mfdLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, loaded.getBaseVectors().size());
+    }
+
+    @Test
+    public void includeDefaultsAppliedToIncludedEntries() throws IOException {
+        // _defaults in the local file should be applied to entries from _include.
+        // Since we can't hit a real remote in unit tests, we verify indirectly:
+        // the _defaults + _include combo should not crash even with unreachable remote.
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "_defaults:\n" +
+                "  cache_dir: " + cacheDir.toString() + "\n" +
+                "_include:\n" +
+                "  url: http://0.0.0.0:1/remote/catalog_entries.yaml\n");
+
+        // should not throw
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        assertFalse(loader.loadDataSet("anything").isPresent());
+    }
+
+    @Test
+    public void includeWithEnvVarInUrl() throws IOException {
+        String envName = findReliableEnvVar();
+
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "_include:\n" +
+                "  url: http://0.0.0.0:1/${" + envName + "}/catalog_entries.yaml\n" +
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(cacheDir);
+
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        assertTrue(loader.loadDataSet("test-ds").isPresent());
+    }
+
+    @Test
+    public void includeWithDefaultValueInUrl() throws IOException {
+        // ${NONEXISTENT:-fallback} in _include url
+        Files.writeString(cacheDir.resolve("catalog_entries.yaml"),
+                "_include:\n" +
+                "  url: http://0.0.0.0:1/${JVECTOR_NONEXISTENT_12345:-fallback}/catalog_entries.yaml\n" +
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(cacheDir);
+
+        // should not throw — the default value is used
+        var loader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        assertTrue(loader.loadDataSet("test-ds").isPresent());
+    }
+
+    // ========================================================================
+    // Log redaction
+    // ========================================================================
+
+    @Test
+    public void redactReplacesLongHexHash() {
+        // SHA-256 hash (64 hex chars)
+        String url = "s3://bucket/6174752eb60168112f2edb38493782da2ebe5ae6bfc870e25ed1711205e5395d/dpr/file.fvecs";
+        String redacted = DataSetLoaderSimpleMFD.redact(url);
+        assertFalse(redacted.contains("6174752e"), "Hash should be redacted: " + redacted);
+        assertTrue(redacted.contains("[[redacted]]"), "Should contain redaction marker: " + redacted);
+        assertTrue(redacted.contains("s3://bucket/"), "Bucket should be preserved: " + redacted);
+        assertTrue(redacted.contains("/dpr/file.fvecs"), "Non-hash path should be preserved: " + redacted);
+    }
+
+    @Test
+    public void redactHandlesHashWithSuffix() {
+        // hash_private pattern
+        String url = "s3://bucket/6174752eb60168112f2edb38493782da2ebe5ae6bfc870e25ed1711205e5395d_private/data.fvecs";
+        String redacted = DataSetLoaderSimpleMFD.redact(url);
+        assertFalse(redacted.contains("6174752e"), "Hash should be redacted: " + redacted);
+        assertTrue(redacted.contains("[[redacted]]"));
+    }
+
+    @Test
+    public void redactHandlesHashWithDashes() {
+        // UUID-style or dashed key: a3f8b2c1-d4e5-f6a7-b8c9-d0e1f2a3b4c5
+        String url = "s3://bucket/a3f8b2c1-d4e5-f6a7-b8c9-d0e1f2a3b4c5/data.fvecs";
+        String redacted = DataSetLoaderSimpleMFD.redact(url);
+        assertFalse(redacted.contains("a3f8b2c1"), "Dashed hex should be redacted: " + redacted);
+        assertTrue(redacted.contains("[[redacted]]"));
+    }
+
+    @Test
+    public void redactHandlesHashWithDots() {
+        // dotted hex token
+        String url = "https://host/a3f8b2c1.d4e5f6a7.b8c9d0e1.f2a3b4c5/data.fvecs";
+        String redacted = DataSetLoaderSimpleMFD.redact(url);
+        assertFalse(redacted.contains("a3f8b2c1"), "Dotted hex should be redacted: " + redacted);
+    }
+
+    @Test
+    public void redactHandles0xPrefix() {
+        String url = "s3://bucket/0xa3f8b2c1d4e5f6a7b8c9d0e1f2a3b4c5/data.fvecs";
+        String redacted = DataSetLoaderSimpleMFD.redact(url);
+        assertFalse(redacted.contains("a3f8b2c1"), "0x-prefixed hex should be redacted: " + redacted);
+        assertTrue(redacted.contains("[[redacted]]"));
+    }
+
+    @Test
+    public void redactPreservesNonHashPaths() {
+        // A URL with ordinary, human-readable segments passes through redact() untouched.
+        String input = "s3://jvector-datasets-public/datasets-clean/ada_002_100k_base.fvecs";
+        String output = DataSetLoaderSimpleMFD.redact(input);
+        assertEquals(input, output, "No hash segments — should be unchanged");
+    }
+
+    @Test
+    public void redactPreservesDatasetNames() {
+        // these have some hex-like chars but are clearly not secrets
+        assertEquals("/data/e5-base-v2-100k/file.fvecs",
+                DataSetLoaderSimpleMFD.redact("/data/e5-base-v2-100k/file.fvecs"));
+        assertEquals("/data/ada002-100k/file.fvecs",
+                DataSetLoaderSimpleMFD.redact("/data/ada002-100k/file.fvecs"));
+        assertEquals("s3://bucket/cohere-english-v3-1M/file.fvecs",
+                DataSetLoaderSimpleMFD.redact("s3://bucket/cohere-english-v3-1M/file.fvecs"));
+    }
+
+    @Test
+    public void redactPreservesShortHexSegments() {
+        // A six-character hex segment is too short to be treated as a secret.
+        String input = "/data/a3f8b2/file.fvecs";
+        String output = DataSetLoaderSimpleMFD.redact(input);
+        assertEquals(input, output);
+    }
+
+    @Test
+    public void redactHandlesNull() {
+        assertEquals("null", DataSetLoaderSimpleMFD.redact(null)); // null input is expected to yield the literal string "null"
+    }
+
+    @Test
+    public void redactHandlesMultipleSecretSegments() {
+        String url = "s3://bucket/aaaa1111bbbb2222cccc3333dddd4444eeee5555ffff6666/sub/1111222233334444555566667777888899990000aaaabbbb/file.fvecs"; // two long hex segments around a plain "sub" segment
+        String redacted = DataSetLoaderSimpleMFD.redact(url);
+        assertFalse(redacted.contains("aaaa1111"), "First hash should be redacted");
+        assertFalse(redacted.contains("11112222"), "Second hash should be redacted");
+        assertTrue(redacted.contains("/sub/"), "Non-hash path preserved");
+    }
+
+    @Test
+    public void redactHandlesWindowsPaths() {
+        String path = "C:\\data\\6174752eb60168112f2edb38493782da2ebe5ae6bfc870e25ed1711205e5395d\\file.fvecs"; // backslash-separated path with a long hex directory name
+        String redacted = DataSetLoaderSimpleMFD.redact(path);
+        assertFalse(redacted.contains("6174752e"), "Hash should be redacted in Windows paths: " + redacted);
+    }
+
+    @Test
+    public void redactHandlesMixedSeparatorsInSecret() {
+        // hex groups joined by underscores and dashes — still predominantly hex, so the segment must be redacted
+        String url = "s3://bucket/a1b2c3d4_e5f6a7b8-c9d0e1f2_a3b4c5d6/data.fvecs";
+        String redacted = DataSetLoaderSimpleMFD.redact(url);
+        assertFalse(redacted.contains("a1b2c3d4"), "Mixed-separator hex should be redacted: " + redacted);
+    }
+
+    // ========================================================================
+    // _include cached remote catalogs
+    // ========================================================================
+
+    @Test
+    public void includeOnlyCatalogLoadsOfflineFromCachedRemoteCatalog() throws IOException {
+        // public-catalog.yaml is a thin wrapper: it _includes a catalog served over HTTP and sets cache_dir to cachedDataDir
+        Path remoteDir = tempFolder.newFolder("remote-catalog").toPath();
+        Path cachedDataDir = tempFolder.newFolder("cached-public-data").toPath();
+
+        Files.writeString(remoteDir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                        "  base: test_base.fvecs\n" +
+                        "  query: test_query.fvecs\n" +
+                        "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(remoteDir);
+
+        HttpServer server = startFileServer(remoteDir);
+        try {
+            Files.writeString(cacheDir.resolve("public-catalog.yaml"),
+                    "_include:\n" +
+                            "  url: " + urlFor(server, "catalog_entries.yaml") + "\n" +
+                            "_defaults:\n" +
+                            "  cache_dir: " + cachedDataDir + "\n");
+
+            // first run online: the include fetch succeeds and loading the dataset caches its three data files locally
+            var onlineLoader = new DataSetLoaderSimpleMFD(
+                    null, cacheDir.toString(), false, testMetadata
+            );
+
+            var onlineDs = onlineLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+            assertEquals(5, onlineDs.getBaseVectors().size());
+            assertTrue(Files.exists(cachedDataDir.resolve("test_base.fvecs")));
+            assertTrue(Files.exists(cachedDataDir.resolve("test_query.fvecs")));
+            assertTrue(Files.exists(cachedDataDir.resolve("test_gt.ivecs")));
+        } finally {
+            server.stop(0);
+        }
+
+        // second run offline (server stopped): the include fetch fails, but the cached remote catalog
+        // still provides the dataset entry, so the data files already in cachedDataDir can be loaded
+        var offlineLoader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        var offlineDs = offlineLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(5, offlineDs.getBaseVectors().size());
+        assertEquals(2, offlineDs.getQueryVectors().size());
+        assertEquals(2, offlineDs.getGroundTruth().size());
+        assertEquals(4, offlineDs.getDimension());
+    }
+
+    @Test
+    public void localCatalogOverridesCachedIncludedRemoteCatalogOffline() throws IOException {
+        // a locally declared "test-ds" must win over an included remote "test-ds", both online and offline
+        Path remoteDir = tempFolder.newFolder("remote-catalog").toPath();
+        Path cachedRemoteDir = tempFolder.newFolder("cached-public-data").toPath();
+        Path localOverrideDir = tempFolder.newFolder("local-override").toPath();
+
+        Files.writeString(remoteDir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                        "  base: test_base.fvecs\n" +
+                        "  query: test_query.fvecs\n" +
+                        "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(remoteDir);
+
+        Files.writeString(cacheDir.resolve("local-override.yaml"),
+                "test-ds:\n" +
+                        "  cache_dir: " + localOverrideDir + "\n" +
+                        "  base: test_base.fvecs\n" +
+                        "  query: test_query.fvecs\n" +
+                        "  gt: test_gt.ivecs\n");
+        writeLocalOverrideDataFiles(localOverrideDir);
+
+        HttpServer server = startFileServer(remoteDir);
+        try {
+            Files.writeString(cacheDir.resolve("public-catalog.yaml"),
+                    "_include:\n" +
+                            "  url: " + urlFor(server, "catalog_entries.yaml") + "\n" +
+                            "_defaults:\n" +
+                            "  cache_dir: " + cachedRemoteDir + "\n");
+
+            // constructing the loader online fetches and caches the included remote catalog,
+            // but the entry from local-override.yaml should still be the one that is loaded
+            var onlineLoader = new DataSetLoaderSimpleMFD(
+                    null, cacheDir.toString(), false, testMetadata
+            );
+
+            var onlineDs = onlineLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+            assertEquals(1, onlineDs.getBaseVectors().size()); // local override has 1 base vector; the remote copy has 5
+        } finally {
+            server.stop(0);
+        }
+
+        // offline (server stopped), the cached remote catalog must still not override the local dataset
+        var offlineLoader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        var offlineDs = offlineLoader.loadDataSet("test-ds").orElseThrow().getDataSet();
+        assertEquals(1, offlineDs.getBaseVectors().size());
+        assertEquals(1, offlineDs.getQueryVectors().size());
+        assertEquals(1, offlineDs.getGroundTruth().size());
+        assertEquals(4, offlineDs.getDimension());
+    }
+
+    @Test
+    public void cachedIncludedRemoteCatalogStillFailsOfflineWhenDataFilesAreMissing() throws IOException {
+        // a cached copy of the remote catalog must not hide the fact that the data files were never downloaded
+        Path remoteDir = tempFolder.newFolder("remote-catalog").toPath();
+        Path cachedDataDir = tempFolder.newFolder("cached-public-data").toPath();
+
+        Files.writeString(remoteDir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                        "  base: test_base.fvecs\n" +
+                        "  query: test_query.fvecs\n" +
+                        "  gt: test_gt.ivecs\n");
+        writeTestDataFiles(remoteDir);
+
+        HttpServer server = startFileServer(remoteDir);
+        try {
+            Files.writeString(cacheDir.resolve("public-catalog.yaml"),
+                    "_include:\n" +
+                            "  url: " + urlFor(server, "catalog_entries.yaml") + "\n" +
+                            "_defaults:\n" +
+                            "  cache_dir: " + cachedDataDir + "\n");
+
+            // construct the loader once while the server is up so the included catalog is cached locally;
+            // loadDataSet is deliberately not called, so no data files are downloaded
+            new DataSetLoaderSimpleMFD(
+                    null, cacheDir.toString(), false, testMetadata
+            );
+        } finally {
+            server.stop(0);
+        }
+
+        assertFalse(Files.exists(cachedDataDir.resolve("test_base.fvecs")),
+                "Precondition: dataset files should not have been downloaded");
+
+        var offlineLoader = new DataSetLoaderSimpleMFD(
+                null, cacheDir.toString(), false, testMetadata
+        );
+
+        assertThrows(RuntimeException.class, () -> offlineLoader.loadDataSet("test-ds"),
+                "Cached remote catalog should still fail when the chosen data files are missing offline");
+    }
+
+    // ========================================================================
+    // Helpers
+    // ========================================================================
+
+    /// Starts a static HTTP file server rooted at `root`, bound to 127.0.0.1 on a system-chosen port; callers stop it.
+    private static HttpServer startFileServer(Path root) throws IOException {
+        HttpServer server = HttpServer.create(new InetSocketAddress("127.0.0.1", 0), 0);
+        server.createContext("/", exchange -> serveStaticFile(exchange, root));
+        server.start();
+        return server;
+    }
+
+    /// Returns the `http://127.0.0.1:PORT/filename` URL for a file served by the test HTTP server, using its bound port.
+    private static String urlFor(HttpServer server, String filename) {
+        return "http://127.0.0.1:" + server.getAddress().getPort() + "/" + filename;
+    }
+
+    /// Serves the regular file that the request path names under `root`; answers 404 with no body otherwise.
+    private static void serveStaticFile(HttpExchange exchange, Path root) throws IOException {
+        String requestPath = exchange.getRequestURI().getPath();
+        String relativePath = requestPath.startsWith("/") ? requestPath.substring(1) : requestPath;
+        Path file = root.resolve(relativePath).normalize(); // collapse ".." segments before the containment check
+
+        if (!file.startsWith(root) || !Files.isRegularFile(file)) { // outside root, or not a regular file
+            exchange.sendResponseHeaders(404, -1); // -1: no response body
+            exchange.close();
+            return;
+        }
+
+        byte[] bytes = Files.readAllBytes(file);
+        exchange.sendResponseHeaders(200, bytes.length);
+        try (OutputStream output = exchange.getResponseBody()) {
+            output.write(bytes);
+        }
+    }
+
+    /// Writes a one-vector, 4-dimensional dataset (base, query, ground truth) so tests can tell it from the five-vector remote copy.
+    private static void writeLocalOverrideDataFiles(Path dir) throws IOException {
+        writeTestFvecs(dir.resolve("test_base.fvecs"), 4, new float[][] {
+                {1.0f, 1.0f, 0.0f, 0.0f},
+        });
+        writeTestFvecs(dir.resolve("test_query.fvecs"), 4, new float[][] {
+                {1.0f, 1.0f, 0.0f, 0.0f},
+        });
+        writeTestIvecs(dir.resolve("test_gt.ivecs"), new int[][] {
+                {0},
+        });
+    }
+
+    private static void writeTestCatalog(Path dir) throws IOException { // writes catalog_entries.yaml declaring the single dataset "test-ds"
+        Files.writeString(dir.resolve("catalog_entries.yaml"),
+                "test-ds:\n" +
+                "  base: test_base.fvecs\n" +
+                "  query: test_query.fvecs\n" +
+                "  gt: test_gt.ivecs\n");
+    }
+
+    private static void writeTestDataFiles(Path dir) throws IOException { // 5 base vectors, 2 queries, 2 ground-truth rows; dimension 4
+        writeTestFvecs(dir.resolve("test_base.fvecs"), 4, new float[][] {
+                {1.0f, 0.0f, 0.0f, 0.0f},
+                {0.0f, 1.0f, 0.0f, 0.0f},
+                {0.0f, 0.0f, 1.0f, 0.0f},
+                {0.0f, 0.0f, 0.0f, 1.0f},
+                {0.5f, 0.5f, 0.5f, 0.5f},
+        });
+        writeTestFvecs(dir.resolve("test_query.fvecs"), 4, new float[][] {
+                {1.0f, 0.0f, 0.0f, 0.0f},
+                {0.0f, 0.0f, 1.0f, 0.0f},
+        });
+        writeTestIvecs(dir.resolve("test_gt.ivecs"), new int[][] {
+                {0, 4, 1, 2, 3},
+                {2, 4, 0, 1, 3},
+        });
+    }
+
+    /// Writes vectors in fvecs layout: per vector, a little-endian int32 dimension followed by that many float32 values.
+    private static void writeTestFvecs(Path path, int dimension, float[][] vectors) throws IOException {
+        int bytesPerVector = Integer.BYTES + dimension * Float.BYTES;
+        var buf = ByteBuffer.allocate(vectors.length * bytesPerVector).order(ByteOrder.LITTLE_ENDIAN);
+        for (float[] vec : vectors) {
+            buf.putInt(dimension);
+            for (float v : vec) buf.putFloat(v); // assumes vec.length == dimension; the buffer is sized from dimension
+        }
+        Files.write(path, buf.array());
+    }
+
+    /// Writes ground truth in ivecs layout: per row, a little-endian int32 count followed by that many int32 indices.
+    private static void writeTestIvecs(Path path, int[][] entries) throws IOException {
+        int totalBytes = 0;
+        for (int[] entry : entries) totalBytes += Integer.BYTES + entry.length * Integer.BYTES; // rows may differ in length
+        var buf = ByteBuffer.allocate(totalBytes).order(ByteOrder.LITTLE_ENDIAN);
+        for (int[] entry : entries) {
+            buf.putInt(entry.length);
+            for (int v : entry) buf.putInt(v);
+        }
+        Files.write(path, buf.array());
+    }
+}
diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java
index 8ed60e5bb..13f2136aa 100644
--- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java
+++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java
@@ -380,7 +380,7 @@ public void dataSetInfoLazyLoading() {
 
     @Test
     public void productionMetadataFileLoadsSuccessfully() {
-        // This validates the actual dataset_metadata.yml is well-formed
+        // This validates the actual dataset-metadata.yml is well-formed
         var reader = DataSetMetadataReader.load();
         var props = reader.getProperties("ada002-100k");
         assertTrue(props.isPresent(), "ada002-100k should be in the production metadata file");
@@ -391,7 +391,7 @@ public void productionMetadataFileLoadsSuccessfully() {
     public void productionMetadataAllEntriesHaveSimilarityFunction() {
         var reader = DataSetMetadataReader.load();
         // All entries in the production metadata should have a similarity function
-        for (var name : new String[]{"cohere-english-v3-100k", "ada002-100k", "openai-v3-small-100k",
+        for (var name : new String[]{"cohere-english-v3-100k", "ada002-100k", "openai-v3-small-1536-100k",
                 "gecko-100k", "openai-v3-large-3072-100k", "openai-v3-large-1536-100k",
                 "e5-small-v2-100k", "e5-base-v2-100k", "e5-large-v2-100k",
                 "ada002-1M", "colbert-1M"}) {
diff --git a/jvector-examples/yaml-configs/dataset-catalogs/datasets.md b/jvector-examples/yaml-configs/dataset-catalogs/datasets.md
new file mode 100644
index 000000000..ecfd0d9cf
--- /dev/null
+++ b/jvector-examples/yaml-configs/dataset-catalogs/datasets.md
@@ -0,0 +1,148 @@
+# Hosting Datasets
+
+You can host and distribute your datasets remotely as long as they are available
+via HTTPS or S3. This guide explains how to set that up.
+
+## Directory layout
+
+All `.yaml` and `.yml` files under this directory tree are discovered automatically at startup.
+This is configured in DataSets.java as a loader parameter.
+
+```
+jvector-examples/
+  yaml-configs/
+    dataset-catalogs/
+        public-catalog.yaml      # _includes the public S3 catalog
+        local-catalog.yaml       # reference/template with all options documented
+        my-team-datasets.yaml    # name your own for your own datasets
+```
+
+## Quick start
+
+### Using public datasets
+
+Public datasets work out of the box. `public-catalog.yaml` uses
+`_include` to pull the dataset catalog from S3, and files are downloaded on first use:
+
+```sh
+# see what's available
+curl -L https://jvector-datasets-public.s3.us-east-1.amazonaws.com/datasets-clean/catalog_entries.yaml
+```
+
+Downloaded files are cached locally in `dataset_cache/public/` by default.
+Set the `DATASET_CACHE_DIR` environment variable to change this location.
+
+### Adding your own local datasets
+
+1. Create a `.yaml` file anywhere under this directory (e.g. `custom-catalog.yaml`).
+2. Map each dataset name to its three files:
+
+```yaml
+my-dataset:
+  base: /path/to/base_vectors.fvecs
+  query: /path/to/query_vectors.fvecs
+  gt: /path/to/ground_truth.ivecs
+```
+
+3. Add the appropriate settings to these files as well, so BenchYAML can use the datasets.
+   - `jvector-examples/yaml-configs/dataset-metadata.yml`
+   - `jvector-examples/yaml-configs/datasets.yml`
+
+### Hosting remote datasets
+
+You can host datasets on any S3 bucket or HTTPS server. Each dataset needs three files
+in fvecs/ivecs format (base vectors, query vectors, ground truth indices).
+
+**Option A: Use `_include` to reference a remote catalog**
+
+Create a thin local YAML that pulls entries from a remote `catalog_entries.yaml`:
+
+```yaml
+_defaults:
+  cache_dir: ${DATASET_CACHE_DIR:-dataset_cache}/my-remote
+
+_include:
+  url: s3://my-bucket/datasets/catalog_entries.yaml
+```
+
+The remote catalog lists dataset entries in the same format. Its base path (the directory
+containing the catalog file) is used as the default `base_url` for all included entries.
+
+**Option B: Use `base_url` per entry or in `_defaults`**
+
+```yaml
+_defaults:
+  base_url: s3://my-bucket/datasets/
+  cache_dir: ${DATASET_CACHE_DIR:-dataset_cache}/my-remote
+
+ada002-100k:
+  base: ada_002_100k_base.fvecs
+  query: ada_002_100k_query.fvecs
+  gt: ada_002_100k_gt.ivecs
+```
+
+File paths are appended to `base_url` for downloading. Files in subdirectories work too
+(e.g. `base: subdir/file.fvecs` downloads from `s3://my-bucket/datasets/subdir/file.fvecs`).
+
+### Private datasets with secret paths
+
+Use `${VAR}` env var expansion to keep secrets out of committed files:
+
+```yaml
+_defaults:
+  base_url: s3://my-bucket/${DATASET_SECRET_HASH}/
+  cache_dir: ${DATASET_CACHE_DIR:-dataset_cache}/private
+
+dpr-1M:
+  base: dpr/base.fvecs
+  query: dpr/query.fvecs
+  gt: dpr/gt.ivecs
+```
+
+Set `DATASET_SECRET_HASH` in your environment. The `${VAR:-default}` syntax provides a
+fallback value when the variable is not set.
+
+## Catalog file reference
+
+### Required fields (per dataset entry)
+
+| Field   | Description |
+|---------|-------------|
+| `base`  | Path to base vectors file (`.fvecs`) |
+| `query` | Path to query vectors file (`.fvecs`) |
+| `gt`    | Path to ground truth indices file (`.ivecs`) |
+
+### Optional fields
+
+| Field       | Description |
+|-------------|-------------|
+| `base_url`  | Remote URL (S3 or HTTPS) to download files from when not cached locally |
+| `cache_dir` | Local directory for cached files (relative or absolute path) |
+
+### Special entries
+
+| Key          | Description |
+|--------------|-------------|
+| `_defaults`  | Default values folded into all dataset entries in the same file. Entry-level values take precedence. |
+| `_include`   | Contains a `url` field pointing to a remote catalog. Remote entries are fetched and merged with local `_defaults`. |
+| `_*`         | Any root key starting with `_` is excluded from dataset names. |
+
+### Environment variables
+
+- Field values support `${VAR}` and `${VAR:-default}` syntax (bash-style).
+- `${VAR}` expands to the environment variable value; throws an error if not set.
+- `${VAR:-default}` uses the default when the variable is not set (including `${VAR:-}` for empty string).
+- The `DATASET_CACHE_DIR` environment variable sets a global default `cache_dir` when none is specified at the entry or `_defaults` level.
+
+### Cache directory resolution order
+
+1. `cache_dir` on the dataset entry
+2. `cache_dir` in `_defaults`
+3. `DATASET_CACHE_DIR` environment variable
+4. `dataset_cache/` under the repository root
+
+### Supported transport protocols
+
+- **S3** (`s3://bucket/path`) -- uses the AWS SDK with anonymous credentials
+- **HTTPS** (`https://host/path`) -- uses Java's built-in HTTP client
+- **Local files** -- no download; files are read directly from the resolved path
diff --git a/jvector-examples/yaml-configs/dataset-catalogs/local-catalog.yaml b/jvector-examples/yaml-configs/dataset-catalogs/local-catalog.yaml
new file mode 100644
index 000000000..0aee43b9a
--- /dev/null
+++ b/jvector-examples/yaml-configs/dataset-catalogs/local-catalog.yaml
@@ -0,0 +1,41 @@
+# This file maps dataset names to their data files (base, query, ground truth).
+# Any .yaml or .yml file under this directory tree is discovered automatically.
+#
+# Required fields per dataset:
+#   base  - path to base vectors file (.fvecs)
+#   query - path to query vectors file (.fvecs)
+#   gt    - path to ground truth indices file (.ivecs)
+#
+# Optional fields:
+#   base_url  - remote URL (S3 or HTTP/HTTPS) to fetch files from when not cached locally
+#   cache_dir - local directory for cached files (relative or absolute path)
+#
+# Special entries:
+#   _defaults - provides default values folded into all other entries in this file.
+#               Entry-level values take precedence over defaults.
+#   Any root key starting with _ is excluded from dataset names.
+#
+# Environment variables:
+#   Field values support ${VAR} and ${VAR:-default} syntax (bash-style).
+#   ${VAR} expands to the environment variable value (error if not set).
+#   ${VAR:-default} uses the default value when the variable is not set.
+#   The DATASET_CACHE_DIR environment variable sets a global default cache_dir
+#   when none is specified at the entry or _defaults level.
+#
+# Example:
+#
+# _defaults:
+#   base_url: s3://my-bucket/${DATASET_HASH:-datasets}/
+#   cache_dir: ${DATASET_CACHE_DIR:-/tmp/dataset-cache}
+#
+# my_local_data:
+#   base: path_to_base_vectors.fvecs
+#   query: path_to_query_vectors.fvecs
+#   gt: path_to_ground_truth_indices.ivecs
+#
+# my_remote_data:
+#   base_url: s3://my-bucket/${SECRET_HASH}/
+#   cache_dir: /fast-ssd/private
+#   base: private_base.fvecs
+#   query: private_query.fvecs
+#   gt: private_gt.ivecs
diff --git a/jvector-examples/yaml-configs/dataset-catalogs/public-catalog.yaml b/jvector-examples/yaml-configs/dataset-catalogs/public-catalog.yaml
new file mode 100644
index 000000000..cf0188de3
--- /dev/null
+++ b/jvector-examples/yaml-configs/dataset-catalogs/public-catalog.yaml
@@ -0,0 +1,14 @@
+# Please do not modify this file.
+#
+# Hint: you can see what's available in our public-catalog by running
+# curl -L https://jvector-datasets-public.s3.us-east-1.amazonaws.com/datasets-clean/catalog_entries.yaml
+
+_include:
+  url: s3://jvector-datasets-public/datasets-clean/catalog_entries.yaml
+
+# This sets the local cache directory for the datasets in this catalog.
+# If DATASET_CACHE_DIR is not set, they are cached under dataset_cache/public.
+_defaults:
+  cache_dir: ${DATASET_CACHE_DIR:-dataset_cache}/public
+#  cache_dir: ${HOME}/...
+#  cache_dir: /absolute/path/...
diff --git a/jvector-examples/yaml-configs/dataset-catalogs/sharing.md b/jvector-examples/yaml-configs/dataset-catalogs/sharing.md
new file mode 100644
index 000000000..288e319f5
--- /dev/null
+++ b/jvector-examples/yaml-configs/dataset-catalogs/sharing.md
@@ -0,0 +1,33 @@
+# Sharing a Dataset
+
+To share a remotely hosted dataset:
+
+1. **Prepare your files** -- you need three files in fvecs/ivecs format:
+   - `base_vectors.fvecs` -- the vectors to index
+   - `query_vectors.fvecs` -- the vectors to search with
+   - `ground_truth.ivecs` -- the known nearest neighbor indices for each query
+
+2. **Upload them** to an S3 bucket or HTTPS-accessible location.
+
+3. **Create a catalog file** (any `.yaml` or `.yml` file) listing the dataset:
+   ```yaml
+   _defaults:
+     base_url: https://my-server.com/datasets/
+
+   my-dataset:
+     base: my_base_vectors.fvecs
+     query: my_query_vectors.fvecs
+     gt: my_ground_truth.ivecs
+   ```
+
+4. **Distribute the catalog file.** Recipients drop it into
+   `jvector-examples/yaml-configs/dataset-catalogs/` and the loader picks it up automatically.
+   Remote files are downloaded on first use. Downloaded files are cached locally.
+
+For private datasets, use `${VAR}` in the `base_url` to keep secret paths out of the file:
+```yaml
+_defaults:
+  base_url: s3://my-bucket/${SECRET_HASH}/
+```
+
+See [datasets.md](datasets.md) for the full configuration reference.
diff --git a/jvector-examples/yaml-configs/dataset-metadata.yml b/jvector-examples/yaml-configs/dataset-metadata.yml
new file mode 100644
index 000000000..fe48746d1
--- /dev/null
+++ b/jvector-examples/yaml-configs/dataset-metadata.yml
@@ -0,0 +1,99 @@
+# This file contains authoritative metadata for curated benchmark datasets whose
+# raw formats do not carry the properties we need at runtime.
+#
+# The loaders use this file to determine dataset properties, such
+# as similarity_function and load_behavior.
+#
+# load_behavior controls benchmark-loader processing:
+#   LEGACY_SCRUB - use the soon-to-be-deprecated load-time scrubbing behavior
+#   NO_SCRUB     - load vectors and ground truth exactly as stored
+#
+# In JVector 4.0.0-rc.8 and earlier, datasets were scrubbed at load time using methods
+# that were found to have problems. LEGACY_SCRUB re-enables those methods and should
+# only be used to reproduce historical run results.
+#
+# Additional metadata requires corresponding support in DataSetProperties and the
+# relevant loader code.
+
+ada002-100k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+ada002-1M:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+cap-1M:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+cap-6M:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+cohere-english-v3-100k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+cohere-english-v3-1M:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+cohere-english-v3-10M:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+colbert-1M:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+colbert-10M:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+degen-200k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+dpr-1M:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+dpr-10M:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+e5-small-v2-100k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+e5-base-v2-100k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+e5-large-v2-100k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+gecko-100k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+nv-qa-v4-100k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+openai-v3-small-1536-100k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+openai-v3-large-3072-100k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+openai-v3-large-1536-100k:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+# ann-benchmarks
+glove-25-angular:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+glove-50-angular:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+lastfm-64-dot:
+  similarity_function: DOT_PRODUCT
+  load_behavior: NO_SCRUB
+glove-100-angular:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+glove-200-angular:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+nytimes-256-angular:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+sift-128-euclidean:
+  similarity_function: EUCLIDEAN
+  load_behavior: NO_SCRUB
\ No newline at end of file
diff --git a/jvector-examples/yaml-configs/dataset_metadata.yml b/jvector-examples/yaml-configs/dataset_metadata.yml
deleted file mode 100644
index 21e5e69f9..000000000
--- a/jvector-examples/yaml-configs/dataset_metadata.yml
+++ /dev/null
@@ -1,99 +0,0 @@
-# This file contains authoritative metadata for curated benchmark datasets whose
-# raw formats do not carry the properties we need at runtime.
-#
-# Both the MFD and HDF5 loaders use this file to determine dataset properties such
-# as similarity_function and load_behavior.
-#
-# load_behavior controls benchmark-loader processing:
-#   LEGACY_SCRUB - preserve the current load-time scrubbing behavior
-#   NO_SCRUB     - load vectors and ground truth exactly as stored
-#
-# During the transition, existing deployed datasets should generally remain on
-# LEGACY_SCRUB until their prescrubbed replacements and matching offline ground
-# truth are ready. New prescrubbed datasets should use NO_SCRUB.
-#
-# Additional metadata requires corresponding support in DataSetProperties and the
-# relevant loader code.
-
-ada002-100k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-ada002-1M:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-cap-1M:
-  similarity_function: DOT_PRODUCT
-  load_behavior: LEGACY_SCRUB
-cap-6M:
-  similarity_function: DOT_PRODUCT
-  load_behavior: LEGACY_SCRUB
-cohere-english-v3-100k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-cohere-english-v3-1M:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-cohere-english-v3-10M:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-colbert-1M:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-colbert-10M:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-degen-200k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-dpr-1M:
-  similarity_function: DOT_PRODUCT
-  load_behavior: LEGACY_SCRUB
-dpr-10M:
-  similarity_function: DOT_PRODUCT
-  load_behavior: LEGACY_SCRUB
-e5-small-v2-100k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-e5-base-v2-100k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-e5-large-v2-100k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-gecko-100k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-nv-qa-v4-100k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-openai-v3-small-100k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-openai-v3-large-3072-100k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-openai-v3-large-1536-100k:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-# ann-benchmarks
-glove-25-angular.hdf5:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-glove-50-angular.hdf5:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-lastfm-64-dot.hdf5:
-  similarity_function: DOT_PRODUCT
-  load_behavior: LEGACY_SCRUB
-glove-100-angular.hdf5:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-glove-200-angular.hdf5:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-nytimes-256-angular.hdf5:
-  similarity_function: COSINE
-  load_behavior: LEGACY_SCRUB
-sift-128-euclidean.hdf5:
-  similarity_function: EUCLIDEAN
-  load_behavior: LEGACY_SCRUB
\ No newline at end of file
diff --git a/jvector-examples/yaml-configs/datasets.yml b/jvector-examples/yaml-configs/datasets.yml
index a35555704..58d23116e 100644
--- a/jvector-examples/yaml-configs/datasets.yml
+++ b/jvector-examples/yaml-configs/datasets.yml
@@ -1,23 +1,30 @@
-neighborhood-watch-100k:
+jvector-100k:
   - cohere-english-v3-100k
   - ada002-100k
-  - openai-v3-small-100k
+  - openai-v3-small-1536-100k
   - gecko-100k
   - openai-v3-large-3072-100k
   - openai-v3-large-1536-100k
   - e5-small-v2-100k
   - e5-base-v2-100k
   - e5-large-v2-100k
-neighborhood-watch-1M:
+jvector-1M:
   - ada002-1M
   - colbert-1M
 ann-benchmarks:
-  - glove-25-angular.hdf5
-  - glove-50-angular.hdf5
-  - lastfm-64-dot.hdf5
-  - glove-100-angular.hdf5
-  - glove-200-angular.hdf5
-  - nytimes-256-angular.hdf5
-  - sift-128-euclidean.hdf5
-  # - deep-image-96-angular.hdf5 # large files not yet supported
-  # - gist-960-euclidean.hdf5 # large files not yet supported
\ No newline at end of file
+  - glove-25-angular
+  - glove-50-angular
+  - lastfm-64-dot
+  - glove-100-angular
+  - glove-200-angular
+  - nytimes-256-angular
+  - sift-128-euclidean
+#other-datasets:
+#  - dpr-1M
+#  - dpr-10M
+#  - cap-1M
+#  - cap-6M
+#  - cohere-english-v3-1M
+#  - cohere-english-v3-10M
+#  - deep-image-96-angular # ann-benchmarks entry; large files not yet supported
+#  - gist-960-euclidean # ann-benchmarks entry; large files not yet supported
\ No newline at end of file
diff --git a/jvector-examples/yaml-configs/autoDefault.yml b/jvector-examples/yaml-configs/index-parameters/autoDefault.yml
similarity index 100%
rename from jvector-examples/yaml-configs/autoDefault.yml
rename to jvector-examples/yaml-configs/index-parameters/autoDefault.yml
diff --git a/jvector-examples/yaml-configs/colbert-1M.yml b/jvector-examples/yaml-configs/index-parameters/colbert-1M.yml
similarity index 95%
rename from jvector-examples/yaml-configs/colbert-1M.yml
rename to jvector-examples/yaml-configs/index-parameters/colbert-1M.yml
index b9e6c72b7..d0de5e0f4 100644
--- a/jvector-examples/yaml-configs/colbert-1M.yml
+++ b/jvector-examples/yaml-configs/index-parameters/colbert-1M.yml
@@ -24,4 +24,4 @@ search:
   compression:
     - type: None
 
-# Run-level controls, such as benchmarks, console, and logging, are in run.yml.
\ No newline at end of file
+# Run-level controls, such as benchmarks, console, and logging, are in run-config.yml.
\ No newline at end of file
diff --git a/jvector-examples/yaml-configs/default.yml b/jvector-examples/yaml-configs/index-parameters/default.yml
similarity index 98%
rename from jvector-examples/yaml-configs/default.yml
rename to jvector-examples/yaml-configs/index-parameters/default.yml
index 346a701e4..b56e27ed0 100644
--- a/jvector-examples/yaml-configs/default.yml
+++ b/jvector-examples/yaml-configs/index-parameters/default.yml
@@ -36,4 +36,4 @@ search:
         centerData: No
         anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
 
-# Run-level controls, such as benchmarks, console, and logging, are in run.yml.
\ No newline at end of file
+# Run-level controls, such as benchmarks, console, and logging, are in run-config.yml.
\ No newline at end of file
diff --git a/jvector-examples/yaml-configs/glove-100-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-100-angular.yml
similarity index 100%
rename from jvector-examples/yaml-configs/glove-100-angular.yml
rename to jvector-examples/yaml-configs/index-parameters/glove-100-angular.yml
diff --git a/jvector-examples/yaml-configs/glove-200-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-200-angular.yml
similarity index 100%
rename from jvector-examples/yaml-configs/glove-200-angular.yml
rename to jvector-examples/yaml-configs/index-parameters/glove-200-angular.yml
diff --git a/jvector-examples/yaml-configs/glove-25-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-25-angular.yml
similarity index 100%
rename from jvector-examples/yaml-configs/glove-25-angular.yml
rename to jvector-examples/yaml-configs/index-parameters/glove-25-angular.yml
diff --git a/jvector-examples/yaml-configs/glove-50-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-50-angular.yml
similarity index 100%
rename from jvector-examples/yaml-configs/glove-50-angular.yml
rename to jvector-examples/yaml-configs/index-parameters/glove-50-angular.yml
diff --git a/jvector-examples/yaml-configs/lastfm-64-dot.yml b/jvector-examples/yaml-configs/index-parameters/lastfm-64-dot.yml
similarity index 100%
rename from jvector-examples/yaml-configs/lastfm-64-dot.yml
rename to jvector-examples/yaml-configs/index-parameters/lastfm-64-dot.yml
diff --git a/jvector-examples/yaml-configs/nytimes-256-angular.yml b/jvector-examples/yaml-configs/index-parameters/nytimes-256-angular.yml
similarity index 100%
rename from jvector-examples/yaml-configs/nytimes-256-angular.yml
rename to jvector-examples/yaml-configs/index-parameters/nytimes-256-angular.yml
diff --git a/jvector-examples/yaml-configs/sift-128-euclidean.yml b/jvector-examples/yaml-configs/index-parameters/sift-128-euclidean.yml
similarity index 100%
rename from jvector-examples/yaml-configs/sift-128-euclidean.yml
rename to jvector-examples/yaml-configs/index-parameters/sift-128-euclidean.yml
diff --git a/jvector-examples/yaml-configs/run.yml b/jvector-examples/yaml-configs/run-config.yml
similarity index 100%
rename from jvector-examples/yaml-configs/run.yml
rename to jvector-examples/yaml-configs/run-config.yml
diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java
index 8e9cc712f..28127fb34 100644
--- a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java
+++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java
@@ -17,7 +17,7 @@
 
 
 import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
-import io.github.jbellis.jvector.example.benchmarks.datasets.DataSetLoaderHDF5;
+import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets;
 import io.github.jbellis.jvector.graph.GraphIndexBuilder;
 import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues;
 import org.openjdk.jmh.annotations.Benchmark;
@@ -44,8 +44,8 @@ public static class Parameters {
         final ListRandomAccessVectorValues ravv;
 
         public Parameters() {
-            this.ds = new DataSetLoaderHDF5().loadDataSet("hdf5/glove-100-angular.hdf5").orElseThrow(
-                    () -> new RuntimeException("Unable to load dataset: hdf5/glove-100-angular.hdf5" )
+            this.ds = DataSets.loadDataSet("glove-100-angular").orElseThrow(
+                    () -> new RuntimeException("Unable to load dataset: glove-100-angular")
             ).getDataSet();
             this.ravv = new ListRandomAccessVectorValues(ds.getBaseVectors(), ds.getBaseVectors().get(0).length());
         }
diff --git a/rat-excludes.txt b/rat-excludes.txt
index 64aeba7a3..436c97822 100644
--- a/rat-excludes.txt
+++ b/rat-excludes.txt
@@ -24,6 +24,11 @@ src/test/resources/log4j2-test.xml
 results.csv
 scripts/test_node_setup.sh
 scripts/jmh_results_formatter.py
-yaml-configs/*.yml
+yaml-configs/**/*.yaml
+yaml-configs/**/*.yml
 src/main/resources/logback.xml
 docs/**/*.md
+yaml-configs/**/*.md
+local_datasets/**
+**/datasets/**
+