diff --git a/.github/workflows/run-bench.yml b/.github/workflows/run-bench.yml index 19762aaa5..6919b0e56 100644 --- a/.github/workflows/run-bench.yml +++ b/.github/workflows/run-bench.yml @@ -126,6 +126,20 @@ jobs: ref: ${{ matrix.branch }} fetch-depth: 0 + # ========================================== + # Decode and write the protected dataset catalog + # + # TO UPDATE THIS SECRET: + # 1. On your local machine, run: + # base64 -i jvector-examples/yaml-configs/dataset-catalogs/protected-catalog.yaml + # 2. Go to GitHub Repo -> Settings -> Secrets and variables -> Actions + # 3. Update the PROTECTED_CATALOG_YAML secret with the new Base64 string. + # ========================================== + - name: Inject Protected Catalog + run: | + mkdir -p jvector-examples/yaml-configs/dataset-catalogs + echo "${{ secrets.PROTECTED_CATALOG_YAML }}" | base64 -d > jvector-examples/yaml-configs/dataset-catalogs/protected-catalog.yaml + # Create a directory to store benchmark results - name: Create results directory run: mkdir -p benchmark_results @@ -137,8 +151,6 @@ jobs: # Run the benchmark if jvector-examples exists - name: Run benchmark id: run-benchmark - env: - DATASET_HASH: ${{ secrets.DATASETS_KEYPATH }} run: | # Check if jvector-examples directory and AutoBenchYAML class exist if [ ! 
-d "jvector-examples" ]; then diff --git a/.gitignore b/.gitignore index ea443370d..4b5599f84 100644 --- a/.gitignore +++ b/.gitignore @@ -3,10 +3,19 @@ local/ .mvn/wrapper/maven-wrapper.jar .java-version .bob/ +dataset_ +**/local_datasets/** ### Bench caches pq_cache/ index_cache/ +dataset_cache/ + +### Data catalogs +jvector-examples/yaml-configs/dataset-catalogs/*.yaml +jvector-examples/yaml-configs/dataset-catalogs/*.yml +!jvector-examples/yaml-configs/dataset-catalogs/public-catalog.yaml +jvector-examples/yaml-configs/dataset-catalogs/.catalog-cache/ ### Logging (or whatever you use) logging/ @@ -49,3 +58,5 @@ hdf5/ # JMH generated files dependency-reduced-pom.xml results.csv +**/datasets/custom/** +**/dataset_cache/** diff --git a/README.md b/README.md index cb9843336..10e9eb738 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ You may also use method-level filtering and patterns, e.g., (The `failIfNoSpecifiedTests` option works around a quirk of surefire: it is happy to run `test` with submodules with empty test sets, but as soon as you supply a filter, it wants at least one match in every submodule.) -You can run `SiftSmall` and `Bench` directly to get an idea of what all is going on here. `Bench` will automatically download required datasets to the `fvec` and `hdf5` directories. +You can run `SiftSmall` and `Bench` directly to get an idea of what all is going on here. `Bench` will automatically download required datasets to the `dataset_cache` directory. The files used by `SiftSmall` can be found in the [siftsmall directory](./siftsmall) in the project root. To run either class, you can use the Maven exec-plugin via the following incantations: diff --git a/docs/benchmarking.md b/docs/benchmarking.md index e27113dc1..79ddba82e 100644 --- a/docs/benchmarking.md +++ b/docs/benchmarking.md @@ -4,21 +4,19 @@ JVector comes with a built-in benchmarking system in `jvector-examples/.../Bench To run a benchmark - Decide which dataset(s) you want to benchmark. 
A dataset consists of - - The vectors to be indexed, usually called the "base" or "target" vectors. - - The query vectors. - - The "ground truth" results which are used to compute accuracy metrics. - - The similarity metric which should have been used to compute the ground truth (dot product, cosine similarity or L2 distance). -- Configure the parameters combinations for which you want to run the benchmark. This includes graph index parameters, quantization parameters and search parameters. + - The vectors to be indexed, usually called the "base" or "target" vectors + - The query vectors + - The "ground truth" results that are used to compute accuracy metrics + - The similarity metric used to compute the ground truth (dot product, cosine similarity or L2 distance) +- Configure the parameter combinations for which you want to run the benchmark. This includes index construction parameters, quantization parameters and search parameters. -JVector supports two types of datasets: -- **Fvec/Ivec**: The dataset consists of three files, for example `base.fvec`, `queries.fvec` and `neighbors.ivec` containing the base vectors, query vectors, and ground truth. (`fvec` and `ivec` file formats are described [here](http://corpus-texmex.irisa.fr/)) -- **HDF5**: The dataset consists of a single HDF5 file with three datasets labelled `train`, `test` and `neighbors`, representing the base vectors, query vectors and the ground truth. +JVector supports datasets in the fvecs/ivecs format. These consist of three files, for example `base.fvecs`, `queries.fvecs` and `neighbors.ivecs` containing the base vectors, query vectors, and ground truth. (`fvecs` and `ivecs` file formats are described [here](http://corpus-texmex.irisa.fr/)) The general procedure for running benchmarks is mentioned below. The following sections describe the process in more detail. - [Specify the dataset](#specifying-datasets) names to benchmark in `datasets.yml`. - Certain datasets will be downloaded automatically. 
If using a different dataset, make sure the dataset files are downloaded and made available (refer the section on [Custom datasets](#custom-datasets)). -- Adjust the benchmark parameters in `default.yml`. This will affect the parameters for all datasets to be benchmarked. You can specify custom parameters for a specific dataset by creating a file called `.yml` in the same folder. -- Decide on the kind of measurements and logging you want and configure them in `run.yml`. +- Adjust the benchmark parameters in `default.yml`. This will affect the parameters for all datasets benchmarked. You can specify custom parameters for a specific dataset by creating a file called `<dataset-name>.yml` in the `index-parameters` subfolder. +- Decide on the kind of measurements and logging you want and configure them in `run-config.yml`. You can run the configured benchmark with maven: ```sh @@ -31,31 +29,28 @@ The datasets you want to benchmark should be specified in `jvector-examples/yaml To benchmark a single dataset, comment out the entries corresponding to all other datasets. (Or provide command line arguments as described in [Running `bench` from the command line](#running-bench-from-the-command-line)) -Datasets are assumed to be Fvec/Ivec based unless the entry in the `datasets.yml` ends with `.hdf5`. In this case, `.hdf5` is not considered part of the "dataset name" referenced in other sections. +Datasets are grouped into categories. The categories can be arbitrarily chosen for convenience and are not currently considered by the benchmarking system. -You'll notice that datasets are grouped into categories. The categories can be arbitrarily chosen for convenience and are not currently considered by the benchmarking system. - -For HDF5 files, the substrings `-angular`, `-euclidean` and `-dot` correspond to cosine similarity, L2 distance, and dot product similarity functions (these substrings ARE considered to be part of the "dataset name"). 
Currently, Fvec/Ivec datasets are implicitly assumed to use cosine similarity (changing this requires editing `DataSetLoaderMFD.java`). +Dataset similarity functions are configured in `jvector-examples/yaml-configs/dataset-metadata.yml`. Example `datasets.yml`: ```yaml category0: - - my-fvec-dataset # fvec/ivec dataset, cosine similarity - - my-hdf5-dataset-angular.hdf5 # hdf5 dataset, cosine similarity + - my-dataset-a + - my-dataset-b some-other-category: - - a-huge-dataset-1024d-euclidean.hdf5 # hdf5 dataset, L2 similarity - - my-simple-dataset-dot.hdf5 # hdf5 dataset, dot product similarity - - some-dataset-euclidean # fvec/ivec dataset, cosine similarity (NOT L2 unless you change the code!) + - another-dataset-a + - another-dataset-b ``` ## Setting benchmark parameters ### default.yml / \.yml -`jvector-examples/yaml-configs/default.yml` specifies the default index construction and search parameters to be used by `bench` for all datasets. +`jvector-examples/yaml-configs/index-parameters/default.yml` specifies the default index construction and search parameters to be used by `bench` for all datasets. -You can specify a custom set of a parameters for any given dataset by creating a file called `.yml`, with `` replaced by the actual name of the dataset. This is the same as the identifier used in `datasets.yml`, but without the `.hdf5` suffix for hdf5 datasets. The format of this file is exactly the same as `default.yml`. +You can specify a custom set of parameters for any given dataset by creating a file called `<dataset-name>.yml`, with `<dataset-name>` replaced by the actual name of the dataset. This is the same as the identifier used in `datasets.yml`. The format of this file is exactly the same as `default.yml`. Refer to `default.yml` for a list of all options. @@ -67,7 +62,7 @@ construction: ``` will build and benchmark four graphs, one for each combination of M and ef in {(32, 100), (64, 100), (32, 200), (64, 200)}. 
This is particularly useful when running a Grid search to identify the best performing parameters. -### run.yml +### run-config.yml This file contains configurations for - Specifying the measurements you want to report, like QPS, latency and recall @@ -75,7 +70,7 @@ This file contains configurations for The configurations in this file are "run-level", meaning that they are shared across all the datasets being benchmarked. -See `run.yml` for a full list of all options. +See `run-config.yml` for a full list of all options. ## Running `bench` from the command line @@ -86,45 +81,37 @@ mvn compile exec:exec@bench -pl jvector-examples -am To benchmark a subset of the datasets in `datasets.yml`, you can provide a space-separated list of regexes as arguments. ```sh -# matches `glove-25-angular.hdf5`, `glove-50-angular.hdf5`, `nytimes-256-angular.hdf5` etc +# matches `glove-25-angular`, `glove-50-angular`, `nytimes-256-angular` etc mvn compile exec:exec@bench -pl jvector-examples -am -DbenchArgs="glove nytimes" ``` ## Custom Datasets -### Custom Fvec/Ivec datasets - -Using fvec/ivec datasets requires them to be configured in `DataSetLoaderMFD.java`. Some datasets are already pre-configured; these will be downloaded and used automatically on running the benchmark. - -To use a custom dataset consisting of files `base.fvec`, `queries.fvec` and `neighbors.ivec`, do the following: -- Ensure that you have three files: - - `base.fvec` containing N D-dimensional float vectors. These are used to build the index. - - `queries.fvec` containing Q D-dimensional float vectors. These are used for querying the built index. - - `neighbors.ivec` containing Q K-dimensional integer vectors, one for each query vector, representing the exact K-nearest neighbors for that query among the base vectors. - The files can be named however you like. -- Save all three files somewhere in the `fvec` directory in the root of the `jvector` repo (if it doesn't exist, create it). 
It's recommended to create at least one sub-folder with the name of the dataset and copy or move all three files there. -- Edit `DataSetLoaderMFD.java` to configure a new dataset and it's associated files: - ```java - put("cust-ds", new MultiFileDatasource("cust-ds", - "cust-ds/base.fvec", - "cust-ds/query.fvec", - "cust-ds/neighbors.ivec")); +Datasets are configured via YAML catalog files under `jvector-examples/yaml-configs/dataset-catalogs/`. The loader recursively discovers all `.yaml`/`.yml` files in that directory tree. See `jvector-examples/yaml-configs/dataset-catalogs/local-catalog.yaml` for the full format reference. + +To add a custom fvecs/ivecs dataset: + +1. Add a `.yaml` file to the YAML catalog directory, mapping your dataset name to its files: + ```yaml + _defaults: + cache_dir: ${DATASET_CACHE_DIR:-dataset_cache} + + my-dataset: + base: my_base_vectors.fvecs + query: my_query_vectors.fvecs + gt: my_ground_truth.ivecs + ``` +2. Place your fvecs/ivecs files at the paths you specified in the YAML (or specify a `cache_dir` / `base_url` to fetch them from a remote source). +3. Add the dataset's similarity function to `jvector-examples/yaml-configs/dataset-metadata.yml`: + ```yaml + my-dataset: + similarity_function: COSINE + load_behavior: NO_SCRUB ``` - The file paths are resolved relative to the `fvec` directory. `cust-ds` is the name of the dataset and can be changed to whatever is appropriate. -- In `jvector-examples/yaml-configs/datasets.yml`, add an entry corresponding to your custom dataset. Comment out other datasets which you do not want to benchmark. +4. Add the dataset name to `jvector-examples/yaml-configs/datasets.yml` so BenchYAML can find it: ```yaml custom: - - cust-ds + - my-dataset ``` -## Custom HDF5 datasets - -HDF5 datasets consist of a single file. The Hdf5Loader looks for three HDF5 datasets within the file, `train`, `test` and `neighbors`. 
These correspond to the base, query and neighbors vectors described above for fvec/ivec files. - -To use an HDF5 dataset, edit `jvector-examples/yaml-configs/datasets.yml` to add an entry like the following: -```yaml -category: - - .hdf5 -``` - -BenchYAML looks for hdf5 datasets with the name `.hdf5` in the `hdf5` folder in the root of this repo. If the file doesn't exist, BenchYAML will attempt to automatically download the dataset from ann-benchmarks.com. If your dataset is not from ann-benchmarks.com, simply ensure that the dataset is available in the `hdf5` folder and edit `datasets.yml` accordingly. +For remote datasets, use `base_url` to specify where files should be downloaded from. The `${VAR}` and `${VAR:-default}` syntax is supported for environment variable expansion. See the example config for details. diff --git a/jvector-examples/README.md b/jvector-examples/README.md index 27c09b5d4..c782f7db8 100644 --- a/jvector-examples/README.md +++ b/jvector-examples/README.md @@ -11,8 +11,8 @@ A simple benchmark for the sift dataset located in the [siftsmall](./siftsmall) Performs grid search across the `GraphIndexBuilder` parameter space to find the best tradeoffs between recall and throughput. -This benchmark requires datasets from [https://github.com/erikbern/ann-benchmarks](https://github.com/erikbern/ann-benchmarks/blob/main/README.md#data-sets) to be downloaded to hdf5 and fvec -directories `hdf5` or `fvec` under the project root depending on the dataset format. +This benchmark requires `fvecs` versions of datasets from [https://github.com/erikbern/ann-benchmarks](https://github.com/erikbern/ann-benchmarks/blob/main/README.md#data-sets) to be downloaded to the `dataset_cache` +directory under the project root. You can use [`plot_output.py`](./plot_output.py) to graph the [pareto-optimal points](https://en.wikipedia.org/wiki/Pareto_efficiency) found by `Bench`. 
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java index 343fcbd95..e066a34dc 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java @@ -94,11 +94,11 @@ public static void main(String[] args) throws IOException { RunConfig runCfg = RunConfig.loadDefault(); artifacts = RunArtifacts.open(runCfg, allConfigs); } catch (java.io.FileNotFoundException e) { - // Legacy yamlSchemaVersion "0" behavior: no run.yml + // Legacy yamlSchemaVersion "0" behavior: no run-config.yml // - logging disabled // - console shows compute selection // - compute selection comes from legacy search.benchmarks if present, else default - System.err.println("WARNING: run.yml not found. Falling back to deprecated legacy behavior: " + System.err.println("WARNING: run-config.yml not found. 
Falling back to deprecated legacy behavior: " + "no logging, console mirrors computed benchmarks."); Map> legacyBenchmarks = null; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java index 032ea2f6c..ea4752e4b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java @@ -16,7 +16,7 @@ package io.github.jbellis.jvector.example; -import io.github.jbellis.jvector.example.benchmarks.datasets.DataSetLoaderMFD; +import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets; import io.github.jbellis.jvector.example.reporting.RunArtifacts; import io.github.jbellis.jvector.example.yaml.MultiConfig; import io.github.jbellis.jvector.example.yaml.RunConfig; @@ -36,9 +36,8 @@ public static void main(String[] args) throws IOException { // Run-level policy config (benchmarks/console/logging + run metadata) RunConfig runCfg = RunConfig.loadDefault(); - // Load dataset - var ds = new DataSetLoaderMFD().loadDataSet(datasetName) - .orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found")) + var ds = DataSets.loadDataSet(datasetName).orElseThrow( + () -> new RuntimeException("dataset " + datasetName + " not found")) .getDataSet(); // Run artifacts + selections (sys_info/dataset_info/experiments.csv) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java deleted file mode 100644 index 3c218c85f..000000000 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright DataStax, Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.github.jbellis.jvector.example.benchmarks.datasets; - -import io.github.jbellis.jvector.vector.VectorizationProvider; -import io.github.jbellis.jvector.vector.types.VectorFloat; -import io.github.jbellis.jvector.vector.types.VectorTypeSupport; -import io.jhdf.HdfFile; -import io.jhdf.api.Dataset; -import io.jhdf.object.datatype.FloatingPoint; - -import java.io.IOException; -import java.io.InputStream; -import java.net.HttpURLConnection; -import java.net.URL; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardCopyOption; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.stream.IntStream; - -/** - * This dataset loader will get and load hdf5 files from ann-benchmarks. - * - *

For curated benchmark datasets, properties are provided by - * {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. If the metadata - * does not provide a similarity function, an error is thrown. - */ -public class DataSetLoaderHDF5 implements DataSetLoader { - public static final Path HDF5_DIR = Path.of("hdf5"); - private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport(); - public static final String HDF5_EXTN = ".hdf5"; - private static final DataSetMetadataReader metadata = DataSetMetadataReader.load(); - - /** - * {@inheritDoc} - */ - public Optional loadDataSet(String datasetName) { - return maybeDownloadHdf5(datasetName).map(path -> { - var props = getProperties(datasetName); - props.similarityFunction() - .orElseThrow(() -> new IllegalArgumentException( - "No similarity function configured in dataset_metadata.yml for HDF5 dataset: " + datasetName)); - return new DataSetInfo(props, () -> readHdf5Data(path, props)); - }); - } - - /// Reads base vectors, query vectors, and ground truth from an HDF5 file - /// and returns a {@link DataSet} using the configured dataset properties. 
- private DataSet readHdf5Data(Path path, DataSetProperties props) { - VectorFloat[] baseVectors; - VectorFloat[] queryVectors; - var gtSets = new ArrayList>(); - try (HdfFile hdf = new HdfFile(path)) { - var baseVectorsArray = - (float[][]) hdf.getDatasetByPath("train").getData(); - baseVectors = IntStream.range(0, baseVectorsArray.length).parallel().mapToObj(i -> vectorTypeSupport.createFloatVector(baseVectorsArray[i])).toArray(VectorFloat[]::new); - Dataset queryDataset = hdf.getDatasetByPath("test"); - if (((FloatingPoint) queryDataset.getDataType()).getBitPrecision() == 64) { - // lastfm dataset contains f64 queries but f32 everything else - var doubles = ((double[][]) queryDataset.getData()); - queryVectors = IntStream.range(0, doubles.length).parallel().mapToObj(i -> { - var a = new float[doubles[i].length]; - for (int j = 0; j < doubles[i].length; j++) { - a[j] = (float) doubles[i][j]; - } - return vectorTypeSupport.createFloatVector(a); - }).toArray(VectorFloat[]::new); - } else { - var queryVectorsArray = (float[][]) queryDataset.getData(); - queryVectors = IntStream.range(0, queryVectorsArray.length).parallel().mapToObj(i -> vectorTypeSupport.createFloatVector(queryVectorsArray[i])).toArray(VectorFloat[]::new); - } - int[][] groundTruth = (int[][]) hdf.getDatasetByPath("neighbors").getData(); - gtSets = new ArrayList<>(groundTruth.length); - for (int[] i : groundTruth) { - var gtSet = new ArrayList(i.length); - for (int j : i) { - gtSet.add(j); - } - gtSets.add(gtSet); - } - } - - return DataSetUtils.processDataSet( - path.getFileName().toString(), - props, - Arrays.asList(baseVectors), - Arrays.asList(queryVectors), - gtSets); - } - - /// Looks up dataset properties in {@code dataset_metadata.yml}. 
- /// - /// @param datasetName the logical dataset name (without {@code .hdf5} extension) - /// @return the dataset properties, or a minimal name-only property set if no entry exists - private static DataSetProperties getProperties(String datasetName) { - return metadata.getProperties(datasetName) - .orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName))); - } - - /// Downloads the HDF5 file for the given dataset if it is not already present locally. - /// - /// @param datasetName the logical dataset name (without {@code .hdf5} extension) - /// @return the local path to the HDF5 file, or empty if the remote file was not found - private Optional maybeDownloadHdf5(String datasetName) { - var dsFilePath = HDF5_DIR.resolve(datasetName+HDF5_EXTN); - - if (Files.exists(dsFilePath)) { - return Optional.of(dsFilePath); - } - - // Download from https://ann-benchmarks.com/datasetName - var url = "https://ann-benchmarks.com/" + datasetName + HDF5_EXTN; - - HttpURLConnection connection; - while (true) { - int responseCode; - try { - connection = (HttpURLConnection) new URL(url).openConnection(); - responseCode = connection.getResponseCode(); - } catch (IOException e) { - throw new RuntimeException(e); - } - if (responseCode == HttpURLConnection.HTTP_NOT_FOUND) { - return Optional.empty(); - } - if (responseCode == HttpURLConnection.HTTP_MOVED_PERM || responseCode == HttpURLConnection.HTTP_MOVED_TEMP) { - String newUrl = connection.getHeaderField("Location"); - System.out.println("Redirect detected to URL: " + newUrl); - url = newUrl; - } else { - break; - } - } - - try (InputStream in = connection.getInputStream()) { - Files.createDirectories(dsFilePath.getParent()); - System.out.println("Downloading: " + url); - Files.copy(in, dsFilePath, StandardCopyOption.REPLACE_EXISTING); - } catch (IOException e) { - throw new RuntimeException("Error downloading data:" + e.getMessage(),e); - } - return Optional.of(dsFilePath); - } - -} diff --git 
a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java deleted file mode 100644 index b38d2daf1..000000000 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java +++ /dev/null @@ -1,305 +0,0 @@ -/* - * Copyright DataStax, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.github.jbellis.jvector.example.benchmarks.datasets; - -import io.github.jbellis.jvector.example.util.SiftLoader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider; -import software.amazon.awssdk.http.crt.AwsCrtAsyncHttpClient; -import software.amazon.awssdk.regions.Region; -import software.amazon.awssdk.services.s3.S3AsyncClient; -import software.amazon.awssdk.services.s3.S3AsyncClientBuilder; -import software.amazon.awssdk.transfer.s3.S3TransferManager; -import software.amazon.awssdk.transfer.s3.model.CompletedFileDownload; -import software.amazon.awssdk.transfer.s3.model.DownloadFileRequest; -import software.amazon.awssdk.transfer.s3.model.FileDownload; -import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener; - -import java.io.BufferedInputStream; -import java.io.DataInputStream; -import java.io.FileInputStream; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.*; - -/** - * This dataset loader supports multi-file datasets which are comprised of several files as defined in - * {@link DataSetLoaderMFD.MultiFileDatasource}. - * - *

The vector similarity function is determined by looking up the dataset name in - * {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. If no entry is found, - * an error is thrown. - */ -public class DataSetLoaderMFD implements DataSetLoader { - - private static final Logger logger = LoggerFactory.getLogger(DataSetLoaderMFD.class); - - private final static Set infraDatasets = Set.of("dpr-1M", "dpr-10M", "cap-1M", "cap-6M", "cohere-english-v3-1M", "cohere-english-v3-10M"); - private static final String infraBucketName = "jvector-datasets-infratest"; - private static final String fvecDir = "fvec"; - private static final String bucketName = "astra-vector"; - private static final List bucketNames = List.of(bucketName, infraBucketName); - private static final DataSetMetadataReader metadata = DataSetMetadataReader.load(); - - /** - * {@inheritDoc} - */ - public Optional loadDataSet(String fileName) { - return maybeDownloadFvecs(fileName).map(mfd -> { - var props = metadata.getProperties(mfd.name) - .orElseThrow(() -> new IllegalArgumentException( - "No metadata configured in dataset_metadata.yml for MFD dataset: " + mfd.name)); - props.similarityFunction() - .orElseThrow(() -> new IllegalArgumentException( - "No similarity_function configured in dataset_metadata.yml for MFD dataset: " + mfd.name)); - return new DataSetInfo(props, () -> mfd.load(props)); - }); - } - - /// Downloads the fvec/ivec files for the named dataset from S3 if not already present locally. - /// - /// @param name the logical dataset name - /// @return the datasource descriptor, or empty if the name is not a known multi-file dataset - private Optional maybeDownloadFvecs(String name) { - String bucket = infraDatasets.contains(name) ? 
infraBucketName : bucketName; - var mfd = MultiFileDatasource.byName.get(name); - if (mfd == null) { - logger.debug("MultiFileDatasource not found for name: [" + name + "]"); - return Optional.empty(); - } - logger.info("found dataset definition for {}", name); - - // TODO how to detect and recover from incomplete downloads? - - // get directory from paths in keys - Path fvecPath = Paths.get(fvecDir); - try { - Files.createDirectories(fvecPath.resolve(mfd.directory())); - } catch (IOException e) { - throw new RuntimeException("Failed to create directory: " + fvecDir, e); - } - - try (S3AsyncClient s3Client = s3AsyncClientBuilder().build()) { - S3TransferManager tm = S3TransferManager.builder().s3Client(s3Client).build(); - for (var pathFragment : mfd.paths()) { - Path localPath = fvecPath.resolve(pathFragment); - if (Files.exists(localPath)) { - continue; - } - - var urlPath = pathFragment.toString().replace('\\', '/'); - logger.info("Downloading dataset {} from {}", name, urlPath); - DownloadFileRequest downloadFileRequest = - DownloadFileRequest.builder() - .getObjectRequest(b -> b.bucket(bucket).key(urlPath)) - .addTransferListener(LoggingTransferListener.create()) - .destination(Paths.get(localPath.toString())) - .build(); - - // 3 retries - boolean downloaded = false; - for (int i = 0; i < 3; i++) { - try { - FileDownload downloadFile = tm.downloadFile(downloadFileRequest); - CompletedFileDownload downloadResult = downloadFile.completionFuture().join(); - long downloadedSize = Files.size(localPath); - - // Check if downloaded file size matches the expected size - if (downloadedSize != downloadResult.response().contentLength()) { - logger.error("Incomplete download (got {} of {} bytes). 
Retrying...", - downloadedSize, downloadResult.response().contentLength()); - Files.deleteIfExists(localPath); - continue; - } - - // Validate the file header to catch corrupt downloads - if (!validateVecFileHeader(localPath)) { - logger.error("Downloaded file {} has an invalid header; deleting and retrying", urlPath); - Files.deleteIfExists(localPath); - continue; - } - - logger.info("Downloaded file of length " + downloadedSize); - downloaded = true; - break; - } catch (Exception e) { - logger.error("Download attempt {} failed for {}: {}", i + 1, urlPath, e.getMessage()); - Files.deleteIfExists(localPath); - } - } - if (!downloaded) { - throw new IOException("Failed to download " + urlPath + " after 3 attempts"); - } - } - tm.close(); - } catch (Exception e) { - throw new RuntimeException("Error downloading data from S3: " + e.getMessage()); - } - - return Optional.of(mfd); - } - - /// Reads the first 4 bytes of a vec file (fvecs or ivecs) and checks that the - /// little-endian int32 dimension/count value is positive and reasonable. - private static boolean validateVecFileHeader(Path path) { - try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))) { - int dimension = Integer.reverseBytes(dis.readInt()); - return dimension > 0 && dimension <= 100_000; - } catch (IOException e) { - return false; - } - } - - /// Creates an S3 async client builder configured for anonymous access to US-EAST-1. - private static S3AsyncClientBuilder s3AsyncClientBuilder() { - return S3AsyncClient.builder() - .region(Region.US_EAST_1) - .httpClient(AwsCrtAsyncHttpClient.builder() - .maxConcurrency(16) - .build()) - .credentialsProvider(AnonymousCredentialsProvider.create()); - } - - /// Describes a dataset stored as three separate fvec/ivec files (base vectors, query - /// vectors, and ground truth) in an S3 bucket. Known datasets are registered in {@link #byName}. 
- public static class MultiFileDatasource { - public final String name; - public final Path basePath; - public final Path queriesPath; - public final Path groundTruthPath; - private final static String DATASET_HASH = System.getenv("DATASET_HASH"); - - public MultiFileDatasource(String name, String basePath, String queriesPath, String groundTruthPath) { - this.name = name; - this.basePath = Paths.get(basePath); - this.queriesPath = Paths.get(queriesPath); - this.groundTruthPath = Paths.get(groundTruthPath); - } - - /// Returns the parent directory of the base vectors file. - public Path directory() { - return basePath.getParent(); - } - - /// Returns the three file paths (base, queries, ground truth) that comprise this dataset. - public Iterable paths() { - return List.of(basePath, queriesPath, groundTruthPath); - } - - /// Reads the fvec/ivec files from disk and processes the dataset using the - /// configured dataset properties. - /// - /// @param props the dataset properties controlling similarity and load behavior - /// @return the loaded dataset - public DataSet load(DataSetProperties props) { - var baseVectors = SiftLoader.readFvecs("fvec/" + basePath); - var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath); - var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath); - return DataSetUtils.processDataSet(name, props, baseVectors, queryVectors, gtVectors); - } - - public static Map byName = new HashMap<>() {{ - put("degen-200k", new MultiFileDatasource("degen-200k", - "ada-degen/degen_base_vectors.fvec", - "ada-degen/degen_query_vectors.fvec", - "ada-degen/degen_ground_truth.ivec")); - put("cohere-english-v3-100k", new MultiFileDatasource("cohere-english-v3-100k", - "wikipedia_squad/100k/cohere_embed-english-v3.0_1024_base_vectors_100000.fvec", - "wikipedia_squad/100k/cohere_embed-english-v3.0_1024_query_vectors_10000.fvec", - "wikipedia_squad/100k/cohere_embed-english-v3.0_1024_indices_b100000_q10000_k100.ivec")); - put("cohere-english-v3-1M", 
new MultiFileDatasource("cohere-english-v3-1M", - DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_base_1m_norm.fvecs", - DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_query_10k_norm.fvecs", - DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_gt_1m_ip_k100.ivecs")); - put("cohere-english-v3-10M", new MultiFileDatasource("cohere-english-v3-10M", - DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_base_10m_norm.fvecs", - DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_query_10k_norm.fvecs", - DATASET_HASH + "/cohere/cohere_wikipedia_v3/cohere_wiki_en_flat_gt_10m_ip_k100.ivecs")); - put("colbert-10M", new MultiFileDatasource("colbert-10M", - "wikipedia_squad/10M/colbertv2.0_128_base_vectors_10000000.fvec", - "wikipedia_squad/10M/colbertv2.0_128_query_vectors_100000.fvec", - "wikipedia_squad/10M/colbertv2.0_128_indices_b10000000_q100000_k100.ivec")); - put("colbert-1M", new MultiFileDatasource("colbert-1M", - "wikipedia_squad/1M/colbertv2.0_128_base_vectors_1000000.fvec", - "wikipedia_squad/1M/colbertv2.0_128_query_vectors_100000.fvec", - "wikipedia_squad/1M/colbertv2.0_128_indices_b1000000_q100000_k100.ivec")); - put("nv-qa-v4-100k", new MultiFileDatasource("nv-qa-v4-100k", - "wikipedia_squad/100k/nvidia-nemo_1024_base_vectors_100000.fvec", - "wikipedia_squad/100k/nvidia-nemo_1024_query_vectors_10000.fvec", - "wikipedia_squad/100k/nvidia-nemo_1024_indices_b100000_q10000_k100.ivec")); - put("openai-v3-large-3072-100k", new MultiFileDatasource("openai-v3-large-3072-100k", - "wikipedia_squad/100k/text-embedding-3-large_3072_100000_base_vectors.fvec", - "wikipedia_squad/100k/text-embedding-3-large_3072_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/text-embedding-3-large_3072_100000_indices_query_10000.ivec")); - put("openai-v3-large-1536-100k", new MultiFileDatasource("openai-v3-large-1536-100k", - "wikipedia_squad/100k/text-embedding-3-large_1536_100000_base_vectors.fvec", - 
"wikipedia_squad/100k/text-embedding-3-large_1536_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/text-embedding-3-large_1536_100000_indices_query_10000.ivec")); - put("openai-v3-small-100k", new MultiFileDatasource("openai-v3-small-100k", - "wikipedia_squad/100k/text-embedding-3-small_1536_100000_base_vectors.fvec", - "wikipedia_squad/100k/text-embedding-3-small_1536_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/text-embedding-3-small_1536_100000_indices_query_10000.ivec")); - put("ada002-100k", new MultiFileDatasource("ada002-100k", - "wikipedia_squad/100k/ada_002_100000_base_vectors.fvec", - "wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec")); - put("ada002-1M", new MultiFileDatasource("ada002-1M", - "wikipedia_squad/1M/ada_002_1000000_base_vectors.fvec", - "wikipedia_squad/1M/ada_002_1000000_query_vectors_10000.fvec", - "wikipedia_squad/1M/ada_002_1000000_indices_query_10000.ivec")); - put("e5-small-v2-100k", new MultiFileDatasource("e5-small-v2-100k", - "wikipedia_squad/100k/intfloat_e5-small-v2_100000_base_vectors.fvec", - "wikipedia_squad/100k/intfloat_e5-small-v2_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/intfloat_e5-small-v2_100000_indices_query_10000.ivec")); - put("e5-base-v2-100k", new MultiFileDatasource("e5-base-v2-100k", - "wikipedia_squad/100k/intfloat_e5-base-v2_100000_base_vectors.fvec", - "wikipedia_squad/100k/intfloat_e5-base-v2_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/intfloat_e5-base-v2_100000_indices_query_10000.ivec")); - put("e5-large-v2-100k", new MultiFileDatasource("e5-large-v2-100k", - "wikipedia_squad/100k/intfloat_e5-large-v2_100000_base_vectors.fvec", - "wikipedia_squad/100k/intfloat_e5-large-v2_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/intfloat_e5-large-v2_100000_indices_query_10000.ivec")); - put("gecko-100k", new MultiFileDatasource("gecko-100k", - 
"wikipedia_squad/100k/textembedding-gecko_100000_base_vectors.fvec", - "wikipedia_squad/100k/textembedding-gecko_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/textembedding-gecko_100000_indices_query_10000.ivec")); - put("gecko-1M", new MultiFileDatasource("gecko-1M", - "wikipedia_squad/1M/textembedding-gecko_1000000_base_vectors.fvec", - "wikipedia_squad/1M/textembedding-gecko_1000000_query_vectors_10000.fvec", - "wikipedia_squad/1M/textembedding-gecko_1000000_indices_query_10000.ivec")); - put("dpr-1M", new MultiFileDatasource("dpr-1M", - DATASET_HASH + "/dpr/c4-en_base_1M_norm_files0_2.fvecs", - DATASET_HASH + "/dpr/c4-en_query_10k_norm_files0_1.fvecs", - DATASET_HASH + "/dpr/dpr_1m_gt_norm_ip_k100.ivecs")); - put("dpr-10M", new MultiFileDatasource("dpr-10M", - DATASET_HASH + "/dpr/c4-en_base_10M_norm_files0_2.fvecs", - DATASET_HASH + "/dpr/c4-en_query_10k_norm_files0_1.fvecs", - DATASET_HASH + "/dpr/dpr_10m_gt_norm_ip_k100.ivecs")); - put("cap-1M", new MultiFileDatasource("cap-1M", - DATASET_HASH + "/cap/Caselaw_gte-Qwen2-1.5B_embeddings_base_1m_norm_shuffle.fvecs", - DATASET_HASH + "/cap/Caselaw_gte-Qwen2-1.5B_embeddings_query_10k_norm_shuffle.fvecs", - DATASET_HASH + "/cap/cap_1m_gt_norm_shuffle_ip_k100.ivecs")); - put("cap-6M", new MultiFileDatasource("cap-6M", - DATASET_HASH + "/cap/Caselaw_gte-Qwen2-1.5B_embeddings_base_6m_norm_shuffle.fvecs", - DATASET_HASH + "/cap/Caselaw_gte-Qwen2-1.5B_embeddings_query_10k_norm_shuffle.fvecs", - DATASET_HASH + "/cap/cap_6m_gt_norm_shuffle_ip_k100.ivecs")); - }}; - } -} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFD.java new file mode 100644 index 000000000..5582e27e8 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFD.java @@ -0,0 +1,928 @@ +/* + * 
Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.jbellis.jvector.example.benchmarks.datasets; + +import io.github.jbellis.jvector.example.util.SiftLoader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.yaml.snakeyaml.Yaml; +import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.services.s3.S3AsyncClient; +import software.amazon.awssdk.transfer.s3.S3TransferManager; +import software.amazon.awssdk.transfer.s3.model.CompletedFileDownload; +import software.amazon.awssdk.transfer.s3.model.DownloadFileRequest; +import software.amazon.awssdk.transfer.s3.model.FileDownload; +import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UncheckedIOException; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import 
java.util.stream.Stream; + +/// A dataset loader that works with fvec/ivec datasets described by YAML catalog files +/// matching {@code *.yaml} or {@code *.yml}. +/// Supports S3, HTTP, local-only, and combined remote+local modes. +/// +/// ### Catalog format +/// +/// Each YAML catalog file lists datasets with their base, query, and ground truth +/// files. Optional fields control where files are stored and fetched: +/// +/// - {@code base_url} — overrides the default remote base URL for this entry +/// - {@code cache_dir} — overrides where files are cached locally (relative or absolute path) +/// +/// A special {@code _defaults} entry provides default values that are folded into all other +/// entries (unless the entry already specifies a value). Any root key starting with {@code _} +/// is excluded from dataset names. +/// +/// The environment variable {@code DATASET_CACHE_DIR} sets a global default cache directory +/// when no {@code cache_dir} is specified at any level. +/// +/// Field values may contain {@code ${VAR}} references to environment variables, which are +/// expanded at load time. The bash-style {@code ${VAR:-default}} syntax is supported to +/// provide a fallback value when the variable is not set. An {@link IllegalArgumentException} +/// is thrown if a referenced variable is not set and no default is provided. +/// +/// A special {@code _include} entry can reference a remote catalog URL. The remote catalog +/// is fetched and its raw contents are cached locally in a hidden snapshot file for offline use. +/// On each run, the effective included entries are rebuilt by applying the local +/// {@code _defaults} to the fetched (or cached) remote entries. Local entries in the same +/// wrapper file are processed afterward and therefore take precedence over included remote entries. 
+/// This lets a single local file act as a thin configuration wrapper around a remote catalog: +/// ```yaml +/// _defaults: +/// cache_dir: ${DATASET_CACHE_DIR:-fvec} +/// _include: +/// url: s3://bucket/datasets-clean/catalog_entries.yaml +/// ``` +/// +/// ```yaml +/// _defaults: +/// base_url: s3://my-bucket/${DATASET_HASH}/ +/// cache_dir: /data/cache +/// +/// ada002-100k: +/// base: ada_002_100k_base_99287.fvecs +/// query: ada_002_100k_query_10000.fvecs +/// gt: ada_002_100k_gt_ip_100.ivecs +/// +/// # private dataset with its own remote source and cache location +/// dpr-1M: +/// base_url: s3://my-bucket/SECRET_HASH/dpr/ +/// cache_dir: /fast-ssd/dpr +/// base: c4-en_base_1M_norm.fvecs +/// query: c4-en_query_10k_norm.fvecs +/// gt: dpr_1m_gt_norm_ip_k100.ivecs +/// ``` +/// Filenames are resolved relative to the entry's cache directory (local) or the base URL (remote). +/// When {@code base_url} is present on an entry, it is used instead of the loader's default remote +/// base URL for that entry's files. +/// +/// ### Usage patterns +/// +/// **Remote with local caching** — files are downloaded on first use and cached locally. +/// Subsequent runs use cached files. Set {@code checkForUpdates=true} to be warned when the +/// remote catalog changes. Supports both HTTP and S3 URLs. +/// ```java +/// var loader = new DataSetLoaderSimpleMFD( +/// "s3://bucket/datasets-clean/catalog_entries.yaml", +/// "fvec/catalog_entries.yaml", // local cache path +/// true // warn if remote catalog differs from local +/// ); +/// ``` +/// +/// **Local-only with recursive discovery** — the single-arg constructor accepts a directory +/// and recursively scans it for all {@code .yaml}/{@code .yml} files. 
This lets you organise +/// datasets in subdirectories, including private datasets with per-entry {@code base_url} overrides: +/// ``` +/// local_datasets/ +/// mydatasets/ +/// user_entries.yaml # your personal local datasets +/// private-infra/ +/// private_entries.yaml # private remote datasets with base_url per entry +/// ``` +/// ```java +/// var loader = new DataSetLoaderSimpleMFD("local_datasets"); +/// ``` +/// +/// **Remote+local hybrid** — if the local directory already contains {@code catalog_entries.yaml} +/// and data files, they are used as-is. Missing data files are downloaded from the remote. +/// ```java +/// var loader = new DataSetLoaderSimpleMFD( +/// "s3://bucket/datasets-clean/catalog_entries.yaml", +/// "/data/datasets/catalog_entries.yaml", +/// true +/// ); +/// ``` +/// +/// ### Metadata +/// +/// Dataset metadata (similarity function, load behavior) is resolved from +/// {@code dataset-metadata.yml} via {@link DataSetMetadataReader}. A custom metadata reader +/// can be provided via the 4-argument constructor. +/// +/// @see DataSetLoader +public class DataSetLoaderSimpleMFD implements DataSetLoader { + + private static final Logger logger = LoggerFactory.getLogger(DataSetLoaderSimpleMFD.class); + private static final String DEFAULT_CATALOG_FILENAME = "catalog_entries.yaml"; + private static final String CATALOG_GLOB = "*.{yaml,yml}"; + + // ======================================================================================== + // LOG REDACTION — auto-redacts secret-like path segments to prevent leakage + // ======================================================================================== + + /// Minimum number of hex characters (ignoring separators) for a path segment to be + /// considered a potential secret (hash, API key, token, etc.). + private static final int MIN_HEX_CHARS = 20; + + /// Set JVECTOR_LOG_REDACT=false to disable automatic redaction of secret-like path segments. 
+    private static final boolean REDACT_ENABLED =
+            !"false".equalsIgnoreCase(System.getenv("JVECTOR_LOG_REDACT"));
+
+    /// Masks secret-looking path segments in a value before it is written to a log line or
+    /// an exception message.
+    ///
+    /// The value's string form is split on {@code /} and {@code \}. Each segment that
+    /// {@code looksLikeSecret} flags is replaced by {@code [[redacted]]}; delimiters and all
+    /// other segments are copied through unchanged, so the overall path shape is preserved.
+    ///
+    /// @param value the object to render; {@code null} yields the string {@code "null"}
+    /// @return the redacted text, or the plain {@code toString()} result when redaction has
+    ///         been switched off with {@code JVECTOR_LOG_REDACT=false}
+    static String redact(Object value) {
+        if (value == null) {
+            return "null";
+        }
+        String text = value.toString();
+        if (!REDACT_ENABLED || text.isEmpty()) {
+            return text;
+        }
+
+        var out = new StringBuilder(text.length());
+        int segmentStart = 0;
+        // walk one position past the end so the final segment is flushed by the same branch
+        for (int pos = 0; pos <= text.length(); pos++) {
+            boolean atEnd = pos == text.length();
+            if (!atEnd && text.charAt(pos) != '/' && text.charAt(pos) != '\\') {
+                continue; // still inside the current segment
+            }
+            String segment = text.substring(segmentStart, pos);
+            out.append(looksLikeSecret(segment) ? "[[redacted]]" : segment);
+            if (!atEnd) {
+                out.append(text.charAt(pos)); // keep the delimiter exactly as written
+            }
+            segmentStart = pos + 1;
+        }
+        return out.toString();
+    }
+
+    /// Returns true if the segment looks like a hash, token, or API key.
+    /// Strips common separators and 0x prefix, then counts hex characters.
+    private static boolean looksLikeSecret(String segment) {
+        // a leading 0x / 0X marker is not part of the payload, so counting starts after it
+        int start = (segment.startsWith("0x") || segment.startsWith("0X")) ? 2 : 0;
+
+        int significantChars = 0; // every character except the separators - . _
+        int hexChars = 0;
+        for (int idx = start; idx < segment.length(); idx++) {
+            char ch = segment.charAt(idx);
+            boolean separator = ch == '-' || ch == '.' || ch == '_';
+            if (separator) {
+                continue;
+            }
+            significantChars++;
+            boolean hexDigit = (ch >= '0' && ch <= '9')
+                    || (ch >= 'a' && ch <= 'f')
+                    || (ch >= 'A' && ch <= 'F');
+            if (hexDigit) {
+                hexChars++;
+            }
+        }
+
+        if (hexChars < MIN_HEX_CHARS) {
+            return false; // not enough hex digits; this also rejects the empty segment
+        }
+        // at this point significantChars >= hexChars > 0, so the ratio is well defined;
+        // hex digits must make up at least three quarters of the significant characters
+        return (double) hexChars / significantChars >= 0.75;
+    }
+
+    /// Origin of a catalog entry. When two entries share a name, one that comes from a
+    /// real local catalog wins over one that was pulled in from a remote catalog.
+    private enum CatalogSource {
+        // declared directly in a catalog file found on disk
+        LOCAL,
+        // obtained from a fetched (or cached) remote catalog
+        INCLUDED_REMOTE
+    }
+
+    /// Resolved entry in the merged catalog. Tracks where the entry came from so that
+    /// local file resolution, precedence, and per-entry remote base URL overrides work correctly.
+    private static class CatalogEntry {
+        /// Merged, env-expanded catalog fields for this dataset ({@code base}, {@code query},
+        /// {@code gt}, and the optional {@code base_url} / {@code cache_dir}).
+        final Map<String, String> fields;
+        /// Directory in which this entry's data files are looked up and cached locally.
+        final Path cacheDir;
+        /// Per-entry {@code base_url} override, or {@code null} to use the loader default.
+        final String baseUrl;
+        /// Where the entry was declared; drives precedence when names collide.
+        final CatalogSource source;
+
+        CatalogEntry(Map<String, String> fields, Path cacheDir, String baseUrl, CatalogSource source) {
+            this.fields = fields;
+            this.cacheDir = cacheDir;
+            this.baseUrl = baseUrl;
+            this.source = source;
+        }
+    }
+
+    private static final String ENV_DATASET_CACHE_DIR = "DATASET_CACHE_DIR";
+
+    /// Remote URL prefix derived from the constructor's catalog URL, or {@code null} in
+    /// local-only mode.
+    private final String remoteBasePath;
+    /// Dataset name -> resolved catalog entry.
+    private final Map<String, CatalogEntry> catalog;
+    /// Fallback cache directory for entries fetched via the constructor's catalog URL.
+    private final Path localCacheDir;
+    private final DataSetMetadataReader metadata;
+    private final HttpClient httpClient;
+
+    // S3 instances for connection pooling
+    private S3AsyncClient s3Client;
+    private S3TransferManager s3TransferManager;
+
+    /// Creates a local-only loader that recursively discovers all {@code .yaml}/{@code .yml}
+    /// files under the given path.
+    ///
+    /// The {@code localPath} may be either a directory (scanned recursively for catalog files)
+    /// or the full path to a single catalog YAML file.
+    ///
+    /// If the path does not exist or contains no catalog files, the loader is constructed
+    /// successfully but will return empty for all dataset lookups. This allows it to be safely
+    /// registered in a loader list without failing when local datasets are not present.
+    ///
+    /// @param localPath the local directory to scan or full path to a catalog YAML file
+    public DataSetLoaderSimpleMFD(String localPath) {
+        this(null, localPath, false, DataSetMetadataReader.load());
+    }
+
+    /// Creates a loader using the default dataset metadata from {@code dataset-metadata.yml}.
+    ///
+    /// The {@code localPath} may be either a directory or the full path to a catalog YAML file.
+    /// If it ends in {@code .yaml} or {@code .yml}, that file is used as the catalog.
+    /// Otherwise, the directory is scanned recursively for all {@code .yaml}/{@code .yml} files.
+    ///
+    /// Entries without an explicit {@code cache_dir} default to {@code DATASET_CACHE_DIR}
+    /// when that environment variable is set; otherwise they default to the catalog file's
+    /// directory. In constructor-driven remote-catalog mode (when no local catalog exists and
+    /// {@code catalogUrl} is used), fetched remote entries default to {@code dataset_cache/}.
+    /// Entry-level and {@code _defaults}-level {@code cache_dir} values take precedence.
+    ///
+    /// @param catalogUrl      the full URL (HTTP or S3) to the remote catalog, or null/empty
+    ///                        for local-only mode
+    /// @param localPath       the local directory or full path to a catalog YAML file
+    /// @param checkForUpdates if true and a local catalog already exists, the remote catalog is
+    ///                        fetched and compared; a warning is logged if they differ
+    public DataSetLoaderSimpleMFD(String catalogUrl, String localPath, boolean checkForUpdates) {
+        this(catalogUrl, localPath, checkForUpdates, DataSetMetadataReader.load());
+    }
+
+    /// Creates a loader with a custom metadata reader for resolving dataset properties.
+    ///
+    /// Local catalogs are consulted first. The remote catalog at {@code catalogUrl} is fetched
+    /// (and saved next to {@code localPath}) only when no local entries were found.
+    ///
+    /// @param catalogUrl      the full URL (HTTP or S3) to the remote catalog, or null/empty
+    ///                        for local-only mode
+    /// @param localPath       the local directory or full path to a catalog YAML file
+    /// @param checkForUpdates if true and a local catalog already exists, the remote catalog is
+    ///                        fetched and compared; a warning is logged if they differ.
+    ///                        Ignored when catalogUrl is null/empty.
+    /// @param metadata        the metadata reader for resolving dataset properties
+    public DataSetLoaderSimpleMFD(String catalogUrl, String localPath, boolean checkForUpdates, DataSetMetadataReader metadata) {
+        this.metadata = metadata;
+        this.httpClient = HttpClient.newBuilder()
+                .followRedirects(HttpClient.Redirect.NORMAL)
+                .build();
+
+        // resolve localPath for catalog discovery. For discovered local/include catalogs,
+        // entries without an explicit cache_dir fall back to DATASET_CACHE_DIR or the
+        // catalog file's directory. Pure constructor-driven remote catalogs fall back to
+        // dataset_cache.
+        Path resolvedPath = Paths.get(localPath);
+        this.localCacheDir = Paths.get("dataset_cache");
+
+        // a path ending in .yaml/.yml names one catalog file; anything else is a directory
+        boolean singleFileMode = localPath.endsWith(".yaml") || localPath.endsWith(".yml");
+        Path localCatalog = singleFileMode
+                ? resolvedPath
+                : resolvedPath.resolve(DEFAULT_CATALOG_FILENAME);
+
+        // determine whether we have a remote URL (S3 or HTTP)
+        boolean isRemote = catalogUrl != null && !catalogUrl.isEmpty()
+                && (catalogUrl.startsWith("http://") || catalogUrl.startsWith("https://") || catalogUrl.startsWith("s3://"));
+
+        // derive remote base path by stripping the filename from the catalog URL
+        this.remoteBasePath = isRemote
+                ? catalogUrl.substring(0, catalogUrl.lastIndexOf('/') + 1)
+                : null;
+
+        // load local catalog entries — either from a single file or by scanning a directory tree
+        Map<String, CatalogEntry> localEntries = new HashMap<>();
+        if (singleFileMode) {
+            if (Files.exists(localCatalog)) {
+                loadCatalogEntries(localCatalog, localEntries);
+            }
+        } else if (Files.isDirectory(resolvedPath)) {
+            scanForCatalogs(resolvedPath, localEntries);
+        } else if (Files.exists(localCatalog)) {
+            // directory doesn't exist yet but might after remote fetch — check the default file
+            loadCatalogEntries(localCatalog, localEntries);
+        }
+
+        if (!localEntries.isEmpty()) {
+            // report the path that was actually searched, not the fixed fallback cache directory
+            logger.info("Loaded {} datasets from local catalog(s) under {}", localEntries.size(), redact(resolvedPath));
+            this.catalog = localEntries;
+            if (isRemote && checkForUpdates) {
+                checkRemoteCatalogForUpdates(catalogUrl, localEntries);
+            }
+        } else if (isRemote) {
+            logger.info("No local catalog found, fetching from {}", redact(catalogUrl));
+            var remoteCatalogData = fetchRemoteCatalogRaw(catalogUrl);
+            this.catalog = toCatalogEntries(remoteCatalogData, localCacheDir);
+            saveCatalogLocally(localCatalog, catalogUrl, remoteCatalogData);
+        } else {
+            logger.info("No catalog found under {}. This loader will not match any datasets.", redact(resolvedPath));
+            this.catalog = Map.of();
+        }
+    }
+
+    /// Looks up a dataset by name, makes sure its three files are present in the entry's cache
+    /// directory (downloading them concurrently when a remote base URL is known), and returns
+    /// a handle that reads the vectors when its supplier is invoked.
+    ///
+    /// @param dataSetName the catalog key of the dataset
+    /// @return the dataset handle, or empty when the name is not in the catalog or the entry
+    ///         lacks one of {@code base}, {@code query}, {@code gt}
+    /// @throws RuntimeException         if any of the files cannot be obtained
+    /// @throws IllegalArgumentException if the dataset has no metadata entry
+    @Override
+    public Optional<DataSetInfo> loadDataSet(String dataSetName) {
+        var entry = catalog.get(dataSetName);
+        if (entry == null) return Optional.empty();
+
+        var baseFile = entry.fields.get("base");
+        var queryFile = entry.fields.get("query");
+        var gtFile = entry.fields.get("gt");
+        if (baseFile == null || queryFile == null || gtFile == null) {
+            logger.error("Dataset '{}' is missing required fields (base, query, gt) in catalog", dataSetName);
+            return Optional.empty();
+        }
+
+        logger.info("Found dataset '{}' in catalog", dataSetName);
+        var startTime = System.nanoTime();
+
+        // determine the effective remote base URL and local cache directory for this entry
+        String effectiveBaseUrl = entry.baseUrl != null ? entry.baseUrl : remoteBasePath;
+        Path effectiveCacheDir = entry.cacheDir;
+
+        // Execute downloads simultaneously to maximize network bandwidth
+        try {
+            var f1 = CompletableFuture.runAsync(() -> ensureQuietly(baseFile, effectiveCacheDir, effectiveBaseUrl));
+            var f2 = CompletableFuture.runAsync(() -> ensureQuietly(queryFile, effectiveCacheDir, effectiveBaseUrl));
+            var f3 = CompletableFuture.runAsync(() -> ensureQuietly(gtFile, effectiveCacheDir, effectiveBaseUrl));
+
+            CompletableFuture.allOf(f1, f2, f3).join();
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to obtain dataset files for " + dataSetName, e);
+        }
+
+        logger.info("Dataset files ready for '{}' in {}s", dataSetName, String.format("%.2f", (System.nanoTime() - startTime) / 1e9));
+
+        var props = metadata.getProperties(dataSetName)
+                .orElseThrow(() -> new IllegalArgumentException(
+                        String.format(
+                                "Dataset '%s' was found in dataset catalog, but no metadata entry was found in dataset-metadata.yml. ",
+                                dataSetName)));
+        return Optional.of(new DataSetInfo(props, () -> {
+            var baseVectors = SiftLoader.readFvecs(effectiveCacheDir.resolve(baseFile).toString());
+            var queryVectors = SiftLoader.readFvecs(effectiveCacheDir.resolve(queryFile).toString());
+            var gtVectors = SiftLoader.readIvecs(effectiveCacheDir.resolve(gtFile).toString());
+            return DataSetUtils.processDataSet(dataSetName, props, baseVectors, queryVectors, gtVectors);
+        }));
+    }
+
+    // ========================================================================================
+    // CATALOG DISCOVERY & LOADING
+    // ========================================================================================
+
+    /// Returns the effective source for a discovered catalog file.
+    /// Generated remote-catalog snapshots are treated as included remote entries so that
+    /// real local catalogs continue to take precedence across runs.
+    private static CatalogSource catalogSource(Map<String, Map<String, String>> raw) {
+        // Read the flag as an Object and stringify it.
+        // NOTE(review): the YAML parser may deliver an unquoted `true` as a Boolean rather
+        // than a String; going through String.valueOf avoids a ClassCastException either way.
+        Map<String, ?> meta = raw.get("_meta");
+        if (meta == null) {
+            return CatalogSource.LOCAL;
+        }
+        Object generated = meta.get("generated_remote_catalog");
+        return "true".equalsIgnoreCase(String.valueOf(generated))
+                ? CatalogSource.INCLUDED_REMOTE
+                : CatalogSource.LOCAL;
+    }
+
+    /// Inserts an entry while preserving the precedence rule that real local entries
+    /// always win over included remote entries.
+    ///
+    /// @param target the merged catalog being built
+    /// @param name   the dataset name
+    /// @param entry  the candidate entry; dropped only when it is an included remote entry
+    ///               and {@code target} already holds a local entry under {@code name}
+    private static void putCatalogEntry(Map<String, CatalogEntry> target, String name, CatalogEntry entry) {
+        CatalogEntry existing = target.get(name);
+        boolean remoteWouldShadowLocal = existing != null
+                && existing.source == CatalogSource.LOCAL
+                && entry.source != CatalogSource.LOCAL;
+        if (!remoteWouldShadowLocal) {
+            target.put(name, entry);
+        }
+    }
+
+    /// Returns the hidden cache file used to persist the raw contents of an included remote catalog.
+    /// The file name is derived from the SHA-256 of the include URL, so each URL gets its own snapshot.
+    private static Path includeCacheFile(Path catalogDir, String includeUrl) {
+        return catalogDir.resolve(".catalog-cache")
+                .resolve("include-" + sha256Hex(includeUrl) + ".yaml.cache");
+    }
+
+    /// Returns the lowercase hex encoding of the SHA-256 digest of {@code value}'s UTF-8 bytes.
+    private static String sha256Hex(String value) {
+        try {
+            MessageDigest digest = MessageDigest.getInstance("SHA-256");
+            byte[] bytes = digest.digest(value.getBytes(StandardCharsets.UTF_8));
+            StringBuilder hex = new StringBuilder(bytes.length * 2);
+            for (byte b : bytes) {
+                // high nibble first, then low nibble
+                hex.append(Character.forDigit((b >> 4) & 0xF, 16));
+                hex.append(Character.forDigit(b & 0xF, 16));
+            }
+            return hex.toString();
+        } catch (NoSuchAlgorithmException e) {
+            throw new IllegalStateException("SHA-256 should always be available", e);
+        }
+    }
+
+    /// Recursively scans a directory tree for {@code .yaml}/{@code .yml} files and merges
+    /// all entries into the given map. Later entries of the same source type may override
+    /// earlier ones, but real local entries always take precedence over included remote entries.
+    private void scanForCatalogs(Path rootDir, Map<String, CatalogEntry> target) {
+        var matcher = rootDir.getFileSystem().getPathMatcher("glob:" + CATALOG_GLOB);
+
+        // Collect the catalog files first so that only failures of the directory walk itself
+        // are downgraded to a warning; errors raised while loading a catalog still propagate.
+        Path[] catalogFiles;
+        try (Stream<Path> paths = Files.walk(rootDir)) {
+            catalogFiles = paths
+                    .filter(p -> p.getFileName() != null && matcher.matches(p.getFileName()))
+                    .filter(Files::isRegularFile) // skip directories that happen to be named *.yaml
+                    .sorted()                     // walk order is unspecified; sort for stable precedence
+                    .toArray(Path[]::new);
+        } catch (IOException | UncheckedIOException e) {
+            // Files.walk reports errors hit during traversal as UncheckedIOException
+            logger.warn("Error scanning for catalogs under {}: {}", redact(rootDir), redact(e.getMessage()));
+            return;
+        }
+
+        for (Path catalogFile : catalogFiles) {
+            loadCatalogEntries(catalogFile, target);
+        }
+    }
+
+    /// Loads entries from a single catalog file into the target map.
+    /// Handles {@code _defaults} folding, {@code _include} remote fetching, and
+    /// {@code _}-prefixed key exclusion.
+    ///
+    /// When {@code _include} is present, its {@code url} value (after env var expansion) is
+    /// treated as a remote catalog URL. The remote entries are fetched and merged with the local
+    /// defaults, so a single local file can act as a thin wrapper around a remote catalog.
+    /// Entries declared in the file itself are inserted afterwards.
+    ///
+    /// @param catalogFile the catalog YAML file to read
+    /// @param target      the merged catalog to insert into
+    private void loadCatalogEntries(Path catalogFile, Map<String, CatalogEntry> target) {
+        var raw = loadCatalogFromFile(catalogFile);
+        if (raw.isEmpty()) return;
+
+        Path catalogDir = catalogFile.getParent() != null ? catalogFile.getParent() : Paths.get(".");
+        CatalogSource source = catalogSource(raw);
+
+        // extract and expand _defaults if present; a `_defaults:` key with no body maps to null
+        Map<String, String> defaults = raw.get("_defaults");
+        defaults = (defaults == null || defaults.isEmpty()) ? Map.of() : resolveEnvVars(defaults);
+
+        // handle _include: fetch remote catalog and merge with local defaults
+        Map<String, String> includeEntry = raw.get("_include");
+        if (includeEntry != null) {
+            String includeUrl = includeEntry.get("url");
+            if (includeUrl != null) {
+                includeUrl = expandEnvVars(includeUrl);
+                loadRemoteInclude(includeUrl, defaults, catalogDir, includeCacheFile(catalogDir, includeUrl), target);
+            }
+        }
+
+        // count real entries (non-underscore keys)
+        long entryCount = raw.keySet().stream().filter(k -> !k.startsWith("_")).count();
+        if (entryCount > 0) {
+            logger.info("Loading catalog from {} ({} entries)", redact(catalogFile), entryCount);
+        }
+
+        for (var e : raw.entrySet()) {
+            String name = e.getKey();
+            // skip entries whose key starts with _
+            if (name.startsWith("_")) continue;
+
+            // fold defaults into this entry (entry values take precedence)
+            Map<String, String> fields = new HashMap<>(defaults);
+            if (e.getValue() != null) {
+                fields.putAll(e.getValue());
+            }
+
+            putCatalogEntry(target, name, buildCatalogEntry(fields, catalogDir, source));
+        }
+    }
+
+    /// Fetches a remote catalog via {@code _include}, caches its raw contents locally for
+    /// offline reuse, and merges the resulting entries with the local defaults. If the remote
+    /// fetch fails and a cached snapshot exists, the cached catalog is used instead.
+    private void loadRemoteInclude(String includeUrl, Map<String, String> defaults,
+                                   Path catalogDir, Path cachedIncludeFile,
+                                   Map<String, CatalogEntry> target) {
+        Map<String, Map<String, String>> remoteCatalog;
+        boolean usedCachedSnapshot = false;
+
+        try {
+            logger.info("Including remote catalog from {}", redact(includeUrl));
+            remoteCatalog = fetchRemoteCatalogRaw(includeUrl, cachedIncludeFile);
+        } catch (Exception e) {
+            // best-effort: an unreachable include must not break local catalog loading
+            if (!Files.isRegularFile(cachedIncludeFile)) {
+                logger.warn("Failed to include remote catalog from {}: {}", redact(includeUrl), redact(e.getMessage()));
+                return;
+            }
+
+            logger.warn("Failed to include remote catalog from {}: {}. Using cached catalog {}",
+                    redact(includeUrl), redact(e.getMessage()), redact(cachedIncludeFile));
+            remoteCatalog = loadCatalogFromFile(cachedIncludeFile);
+            usedCachedSnapshot = true;
+        }
+
+        // derive the remote base path from the include URL
+        int lastSlash = includeUrl.lastIndexOf('/');
+        String remoteBase = lastSlash >= 0 ? includeUrl.substring(0, lastSlash + 1) : null;
+
+        long entryCount = 0;
+        for (var e : remoteCatalog.entrySet()) {
+            if (e.getKey().startsWith("_")) continue;
+            entryCount++;
+
+            // fold local defaults into remote entry (remote values take precedence over defaults,
+            // but local entries always take precedence — those are handled in the caller's loop)
+            Map<String, String> fields = new HashMap<>(defaults);
+            if (e.getValue() != null) {
+                fields.putAll(e.getValue());
+            }
+            // if the entry doesn't already have a base_url, use the remote catalog's base path
+            if (!fields.containsKey("base_url") && remoteBase != null) {
+                fields.put("base_url", remoteBase);
+            }
+
+            putCatalogEntry(target, e.getKey(), buildCatalogEntry(fields, catalogDir, CatalogSource.INCLUDED_REMOTE));
+        }
+
+        logger.info("Included {} datasets from {} catalog", entryCount,
+                usedCachedSnapshot ? "cached" : "remote");
+    }
+
+    /// Converts a raw catalog map (from a remote fetch) into CatalogEntry objects.
+    /// Handles {@code _defaults} folding and {@code _}-prefixed key exclusion.
+    private static Map<String, CatalogEntry> toCatalogEntries(Map<String, Map<String, String>> raw, Path localDir) {
+        // a `_defaults:` key with no body maps to null, so treat null like "no defaults"
+        Map<String, String> defaults = raw.get("_defaults");
+        if (defaults == null) {
+            defaults = Map.of();
+        }
+
+        var result = new HashMap<String, CatalogEntry>();
+        for (var e : raw.entrySet()) {
+            if (e.getKey().startsWith("_")) continue;
+
+            // entry values override the catalog-wide defaults
+            Map<String, String> fields = new HashMap<>(defaults);
+            if (e.getValue() != null) {
+                fields.putAll(e.getValue());
+            }
+
+            putCatalogEntry(result, e.getKey(), buildCatalogEntry(fields, localDir, CatalogSource.INCLUDED_REMOTE));
+        }
+        return result;
+    }
+
+    /// Field names accepted in a catalog entry after defaults have been folded in.
+    private static final java.util.Set<String> KNOWN_FIELDS = java.util.Set.of(
+            "base", "query", "gt", "base_url", "cache_dir"
+    );
+
+    /// Builds a CatalogEntry from merged fields, resolving env vars, base_url, and cache_dir.
+    ///
+    /// @param fields     the entry's fields with defaults already folded in
+    /// @param catalogDir fallback cache directory when neither {@code cache_dir} nor the
+    ///                   {@code DATASET_CACHE_DIR} environment variable is set
+    /// @param source     where the entry was declared
+    /// @return the resolved entry; its {@code baseUrl} always ends in {@code /} when present
+    /// @throws IllegalArgumentException if an unknown field is present, or a referenced
+    ///                                  environment variable is unset and has no default
+    private static CatalogEntry buildCatalogEntry(Map<String, String> fields, Path catalogDir, CatalogSource source) {
+        // validate that all fields are recognized
+        for (String key : fields.keySet()) {
+            if (!KNOWN_FIELDS.contains(key)) {
+                // Set.of has no defined iteration order; sort so the message is stable across runs
+                throw new IllegalArgumentException(
+                        "Unknown field '" + key + "' in catalog entry. Known fields: "
+                                + new java.util.TreeSet<>(KNOWN_FIELDS));
+            }
+        }
+
+        // expand ${VAR} references in all field values
+        var resolved = resolveEnvVars(fields);
+
+        // normalise base_url so that "baseUrl + filename" always yields a well-formed URL
+        String baseUrl = resolved.get("base_url");
+        if (baseUrl != null && !baseUrl.endsWith("/")) {
+            baseUrl = baseUrl + "/";
+        }
+
+        // resolve cache_dir: entry field > DATASET_CACHE_DIR env var > catalog file's directory
+        Path cacheDir;
+        String cacheDirField = resolved.get("cache_dir");
+        if (cacheDirField != null && !cacheDirField.isEmpty()) {
+            cacheDir = Paths.get(cacheDirField);
+        } else {
+            String envCacheDir = System.getenv(ENV_DATASET_CACHE_DIR);
+            if (envCacheDir != null && !envCacheDir.isEmpty()) {
+                cacheDir = Paths.get(envCacheDir);
+            } else {
+                cacheDir = catalogDir;
+            }
+        }
+
+        return new CatalogEntry(resolved, cacheDir, baseUrl, source);
+    }
+
+    /// Matches {@code ${VAR}} and {@code ${VAR:-default}} syntax.
+    private static final java.util.regex.Pattern ENV_VAR_PATTERN =
+            java.util.regex.Pattern.compile("\\$\\{([^:}]+)(?::-((?:[^}]*)?))?}");
+
+    /// Expands {@code ${VAR}} and {@code ${VAR:-default}} references in all field values
+    /// using environment variables.
+    ///
+    /// @param fields field name to raw value
+    /// @return a new map with the same keys and expanded values
+    /// @throws IllegalArgumentException if a referenced variable is not set and no default is provided
+    private static Map<String, String> resolveEnvVars(Map<String, String> fields) {
+        var resolved = new HashMap<String, String>(fields.size());
+        for (var field : fields.entrySet()) {
+            resolved.put(field.getKey(), expandEnvVars(field.getValue()));
+        }
+        return resolved;
+    }
+
+    /// Expands all {@code ${VAR}} and {@code ${VAR:-default}} occurrences in a single string value.
+    ///
+    /// {@code ${VAR:-default}} follows shell semantics: the default is used when the variable is
+    /// unset or set to the empty string. {@code ${VAR}} without a default yields the variable's
+    /// value as-is (possibly empty) and fails only when the variable is unset.
+    ///
+    /// @param value text to expand; {@code null} and text without {@code ${} are returned unchanged
+    /// @throws IllegalArgumentException if a referenced variable is not set and no default is provided
+    private static String expandEnvVars(String value) {
+        if (value == null || !value.contains("${")) {
+            return value;
+        }
+        var matcher = ENV_VAR_PATTERN.matcher(value);
+        var sb = new StringBuilder();
+        while (matcher.find()) {
+            String varName = matcher.group(1);
+            String defaultValue = matcher.group(2); // null if no :- was present
+            String envValue = System.getenv(varName);
+            if (defaultValue != null) {
+                // ":-" also covers set-but-empty variables (e.g. a CI secret that is not configured)
+                if (envValue == null || envValue.isEmpty()) {
+                    envValue = defaultValue;
+                }
+            } else if (envValue == null) {
+                throw new IllegalArgumentException(
+                        "Environment variable '${" + varName + "}' referenced in catalog entry is not set");
+            }
+            // quoteReplacement keeps '$' and '\' in the value from being read as group references
+            matcher.appendReplacement(sb, java.util.regex.Matcher.quoteReplacement(envValue));
+        }
+        matcher.appendTail(sb);
+        return sb.toString();
+    }
+
+    // ========================================================================================
+    // FILE AVAILABILITY
+    // ========================================================================================
+
+    /// Same as {@code ensureFileAvailable}, with {@link IOException} rethrown as
+    /// {@link UncheckedIOException}.
+    private void ensureQuietly(String filename, Path cacheDir, String baseUrl) {
+        try {
+            ensureFileAvailable(filename, cacheDir, baseUrl);
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /// Ensures a dataset file is available locally. Checks in the entry's cache directory first.
+    /// If not found and a remote base URL is available (either per-entry or loader-level),
+    /// downloads the file.
+    ///
+    /// @param filename file name, resolved against {@code cacheDir} and appended to {@code baseUrl}
+    /// @param cacheDir directory the file is expected in / downloaded into
+    /// @param baseUrl  URL prefix for the download, or {@code null} when there is no remote
+    /// @throws IOException if the file is absent and no URL is configured, or the download fails
+    private void ensureFileAvailable(String filename, Path cacheDir, String baseUrl) throws IOException {
+        Path localPath = cacheDir.resolve(filename);
+        if (Files.exists(localPath)) {
+            return;
+        }
+        if (baseUrl == null) {
+            throw new IOException("File not found locally and no remote URL configured: " + redact(localPath));
+        }
+
+        Path parent = localPath.getParent();
+        if (parent != null) {
+            Files.createDirectories(parent);
+        }
+
+        String url = baseUrl + filename;
+        logger.info("Downloading {} -> {}", redact(url), redact(localPath));
+        downloadUrlToFile(url, localPath);
+    }
+
+    // ========================================================================================
+    // REMOTE CATALOG OPERATIONS
+    // ========================================================================================
+
+    /// Fetches and parses the catalog at {@code catalogUrl} without keeping a snapshot.
+    private Map<String, Map<String, String>> fetchRemoteCatalogRaw(String catalogUrl) {
+        return fetchRemoteCatalogRaw(catalogUrl, null);
+    }
+
+    /// Downloads the catalog at {@code catalogUrl} to a temporary file and parses it. When
+    /// {@code snapshotFile} is non-null, the downloaded file replaces it after parsing succeeds.
+    ///
+    /// @throws RuntimeException if the download or the snapshot update fails with an I/O error,
+    ///                          or the downloaded file cannot be loaded
+    private Map<String, Map<String, String>> fetchRemoteCatalogRaw(String catalogUrl, Path snapshotFile) {
+        try {
+            // stage the download in the snapshot's directory (when there is one), next to the move target
+            Path tempDir = snapshotFile != null ? snapshotFile.getParent() : null;
+            Path tempFile;
+            if (tempDir != null) {
+                Files.createDirectories(tempDir);
+                tempFile = Files.createTempFile(tempDir, "catalog-", ".tmp");
+            } else {
+                tempFile = Files.createTempFile("catalog-", ".tmp");
+            }
+
+            try {
+                downloadUrlToFile(catalogUrl, tempFile);
+                // parse before touching the snapshot so a bad download never replaces a good one
+                var catalog = loadCatalogFromFile(tempFile);
+
+                if (snapshotFile != null) {
+                    Files.move(tempFile, snapshotFile,
+                            StandardCopyOption.ATOMIC_MOVE,
+                            StandardCopyOption.REPLACE_EXISTING);
+                }
+                return catalog;
+            } finally {
+                // no-op after a successful move; removes the temp file on every other path
+                Files.deleteIfExists(tempFile);
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to fetch dataset catalog from " + redact(catalogUrl), e);
+        }
+    }
+
+    /// Writes {@code catalogData} to {@code localCatalog}, preceded by a {@code _meta} section
+    /// that records the URL it came from. Best effort: failures are logged, not thrown.
+    private void saveCatalogLocally(Path localCatalog, String catalogUrl,
+                                    Map<String, Map<String, String>> catalogData) {
+        try {
+            Path parent = localCatalog.getParent() != null ? localCatalog.getParent() : Paths.get(".");
+            Files.createDirectories(parent);
+
+            Path tempFile = Files.createTempFile(parent, "catalog-", ".tmp");
+            try {
+                Map<String, String> meta = new LinkedHashMap<>();
+                meta.put("generated_remote_catalog", "true");
+                meta.put("remote_catalog_url", catalogUrl);
+
+                // insertion order puts _meta first in the dumped YAML
+                Map<String, Map<String, String>> annotated = new LinkedHashMap<>();
+                annotated.put("_meta", meta);
+                annotated.putAll(catalogData);
+
+                Files.writeString(tempFile, new Yaml().dump(annotated));
+                Files.move(tempFile, localCatalog,
+                        StandardCopyOption.ATOMIC_MOVE,
+                        StandardCopyOption.REPLACE_EXISTING);
+            } finally {
+                Files.deleteIfExists(tempFile);
+            }
+        } catch (Exception e) {
+            logger.warn("Failed to cache catalog locally: {}", redact(e.getMessage()));
+        }
+    }
+
+    /// Parses a YAML catalog file into its raw {@code name -> fields} form.
+    ///
+    /// @return the parsed mapping, or an empty map when the document has no content
+    /// @throws RuntimeException if the file cannot be read or its root is not a mapping
+    @SuppressWarnings("unchecked")
+    private static Map<String, Map<String, String>> loadCatalogFromFile(Path path) {
+        try (InputStream in = Files.newInputStream(path)) {
+            // NOTE(review): this also parses catalogs downloaded from a URL. Depending on the
+            // SnakeYAML version in use, new Yaml().load(...) may construct arbitrary tagged types;
+            // consider a SafeConstructor-based Yaml — confirm against the project's dependency.
+            Object loaded = new Yaml().load(in);
+            if (loaded == null) {
+                return Map.of();
+            }
+            if (!(loaded instanceof Map)) {
+                throw new RuntimeException("Catalog root is not a YAML mapping in " + redact(path));
+            }
+            // only the root type is checked here; nested value types are trusted
+            return (Map<String, Map<String, String>>) loaded;
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to load catalog from " + redact(path), e);
+        }
+    }
+
+    /// Fetches the remote catalog and compares it to the local one, logging a warning if they differ.
+    private void checkRemoteCatalogForUpdates(String catalogUrl, Map<String, CatalogEntry> localEntries) {
+        try {
+            var remoteCatalogData = fetchRemoteCatalogRaw(catalogUrl);
+
+            // Bring the raw remote data into the shape buildCatalogEntry stores on an entry:
+            // "_"-prefixed keys dropped, "_defaults" folded under each entry, env vars expanded.
+            // NOTE(review): assumes local entries were built the same way — confirm against the local loader.
+            Map<String, String> remoteDefaults = remoteCatalogData.get("_defaults");
+            if (remoteDefaults == null) {
+                remoteDefaults = Map.of();
+            }
+            var remoteFields = new HashMap<String, Map<String, String>>();
+            for (var entry : remoteCatalogData.entrySet()) {
+                if (entry.getKey().startsWith("_")) {
+                    continue;
+                }
+                Map<String, String> merged = new HashMap<>(remoteDefaults);
+                if (entry.getValue() != null) {
+                    merged.putAll(entry.getValue());
+                }
+                remoteFields.put(entry.getKey(), resolveEnvVars(merged));
+            }
+
+            // compare just the dataset names and file fields, ignoring localDir
+            boolean differs = remoteFields.size() != localEntries.size();
+            if (!differs) {
+                for (var entry : remoteFields.entrySet()) {
+                    var local = localEntries.get(entry.getKey());
+                    if (local == null || !local.fields.equals(entry.getValue())) {
+                        differs = true;
+                        break;
+                    }
+                }
+            }
+            if (differs) {
+                logger.warn("Remote catalog at {} differs from local catalog. Consider updating your local copy.", redact(catalogUrl));
+            }
+        } catch (Exception e) {
+            // the update check is advisory; never let it fail the caller
+            logger.warn("Could not check remote catalog for updates: {}", redact(e.getMessage()));
+        }
+    }
+
+    // ========================================================================================
+    // TRANSPORT PROTOCOL ROUTING (S3 vs HTTP)
+    // ========================================================================================
+
+    /// Downloads {@code url} to {@code localPath}, choosing the transport from the URL scheme.
+    ///
+    /// @throws IllegalArgumentException if the scheme is not {@code s3}, {@code http} or {@code https}
+    /// @throws IOException              if the transfer fails
+    private void downloadUrlToFile(String url, Path localPath) throws IOException {
+        if (url.startsWith("s3://")) {
+            downloadFileS3(url, localPath);
+        } else if (url.startsWith("http://") || url.startsWith("https://")) {
+            downloadFileHttp(url, localPath);
+        } else {
+            throw new IllegalArgumentException("Unsupported URL scheme for download: " + redact(url));
+        }
+    }
+
+    // ========================================================================================
+    // S3 TRANSFER MANAGER IMPLEMENTATION
+    // ========================================================================================
+
+    /// Returns the transfer manager, creating it together with its S3 client on first use.
+    private synchronized S3TransferManager getS3TransferManager() {
+        if (s3TransferManager == null) {
+            s3Client = s3AsyncClient();
+            s3TransferManager = S3TransferManager.builder().s3Client(s3Client).build();
+        }
+        return s3TransferManager;
+    }
+
+    /// Downloads an {@code s3://bucket/key} object to {@code localPath}, making up to three attempts.
+    ///
+    /// @throws IllegalArgumentException if the URL has no bucket or no key
+    /// @throws IOException              if every attempt fails; the last failure is attached as the cause
+    private void downloadFileS3(String s3Url, Path localPath) throws IOException {
+        String withoutScheme = s3Url.substring(5); // strip "s3://"
+        int slashIdx = withoutScheme.indexOf('/');
+        if (slashIdx <= 0 || slashIdx == withoutScheme.length() - 1) {
+            throw new IllegalArgumentException("Malformed S3 URL (expected s3://bucket/key): " + redact(s3Url));
+        }
+        String bucket = withoutScheme.substring(0, slashIdx);
+        String key = withoutScheme.substring(slashIdx + 1);
+
+        S3TransferManager tm = getS3TransferManager();
+
+        DownloadFileRequest request = DownloadFileRequest.builder()
+                .getObjectRequest(b -> b.bucket(bucket).key(key))
+                .addTransferListener(LoggingTransferListener.create())
+                .destination(localPath)
+                .build();
+
+        final int maxAttempts = 3;
+        Exception lastFailure = null;
+        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
+            try {
+                FileDownload downloadFile = tm.downloadFile(request);
+                CompletedFileDownload result = downloadFile.completionFuture().join();
+                long downloadedSize = Files.size(localPath);
+                Long expectedSize = result.response().contentLength();
+
+                // Null check prevents NullPointerException during unboxing.
+                // If expectedSize is null, we trust the transfer manager's successful completion.
+                if (expectedSize == null || downloadedSize == expectedSize) {
+                    return;
+                }
+                logger.error("Incomplete download (got {} of {} bytes). Retrying...", downloadedSize, expectedSize);
+            } catch (Exception e) {
+                lastFailure = e;
+                logger.error("Download attempt {} failed for {}: {}", attempt, redact(key), redact(e.getMessage()));
+            }
+            // drop whatever partial file this attempt left behind
+            Files.deleteIfExists(localPath);
+        }
+        throw new IOException("Failed to download " + redact(s3Url) + " after " + maxAttempts + " attempts", lastFailure);
+    }
+
+    /// Builds the CRT-based async client used by the transfer manager (anonymous credentials, us-east-1).
+    private static S3AsyncClient s3AsyncClient() {
+        return S3AsyncClient.crtBuilder()
+                .region(Region.US_EAST_1)
+                .credentialsProvider(AnonymousCredentialsProvider.create())
+                .targetThroughputInGbps(10.0)
+                .minimumPartSizeInBytes(8L * 1024 * 1024)
+                .build();
+    }
+
+    // ========================================================================================
+    // HTTP CLIENT IMPLEMENTATION
+    // ========================================================================================
+
+    /// Downloads {@code url} over HTTP(S) into a temporary file in the target directory and moves
+    /// it onto {@code localPath} once a 200 response has been written.
+    ///
+    /// @throws IOException on a non-200 status, an I/O failure, or interruption (the thread's
+    ///                     interrupt flag is restored)
+    private void downloadFileHttp(String url, Path localPath) throws IOException {
+        var request = HttpRequest.newBuilder().uri(URI.create(url)).GET().build();
+
+        Path targetDir = localPath.toAbsolutePath().getParent();
+        if (targetDir != null) {
+            Files.createDirectories(targetDir);
+        }
+        Path tempFile = Files.createTempFile(targetDir, "download-", ".tmp");
+
+        try {
+            var response = httpClient.send(request, HttpResponse.BodyHandlers.ofFile(tempFile));
+            if (response.statusCode() != 200) {
+                throw new IOException("HTTP " + response.statusCode() + " downloading " + redact(url));
+            }
+            Files.move(tempFile, localPath, StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new IOException("Interrupted downloading " + redact(url), e);
+        } finally {
+            // no-op after a successful move; removes the temp file on every failure path
+            Files.deleteIfExists(tempFile);
+        }
+    }
+}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java index 93ace9249..3207e492f 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java @@ -21,6 +21,9 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.HashMap; import java.util.Map; import java.util.Optional; @@ -28,7 +31,7 @@ /// Reads dataset metadata from a multi-entry YAML file and provides keyed lookups /// for {@link DataSetProperties}. /// -/// This is used by loaders such as {@link DataSetLoaderMFD} and {@link DataSetLoaderHDF5} +/// This is used by loaders such as {@link DataSetLoaderSimpleMFD} /// that do not have an intrinsic way to determine the similarity function from the dataset /// name or file format alone. /// @@ -48,7 +51,8 @@ /// the exact key first, then falls back to the key with {@code .hdf5} appended. public class DataSetMetadataReader { - private static final String DEFAULT_FILE = "jvector-examples/yaml-configs/dataset_metadata.yml"; + private static final String DEFAULT_FILE = "jvector-examples/yaml-configs/dataset-metadata.yml"; + private static final String MODULE_RELATIVE_DEFAULT_FILE = "yaml-configs/dataset-metadata.yml"; private final Map> metadata; @@ -56,12 +60,26 @@ private DataSetMetadataReader(Map> metadata) { this.metadata = metadata != null ? metadata : Map.of(); } - /// Loads dataset metadata from the default file ({@code jvector-examples/yaml-configs/dataset_metadata.yml}). + /// Loads dataset metadata from the default file ({@code jvector-examples/yaml-configs/dataset-metadata.yml}). 
/// /// @return the loaded metadata /// @throws RuntimeException if the file cannot be read public static DataSetMetadataReader load() { - return load(DEFAULT_FILE); + Path defaultPath = Paths.get(DEFAULT_FILE); + if (Files.isRegularFile(defaultPath)) { + return load(defaultPath.toString()); + } + + Path moduleRelativePath = Paths.get(MODULE_RELATIVE_DEFAULT_FILE); + if (Files.isRegularFile(moduleRelativePath)) { + return load(moduleRelativePath.toString()); + } + + throw new RuntimeException( + "Failed to load dataset metadata from default locations: " + + defaultPath.toAbsolutePath().normalize() + + " or " + + moduleRelativePath.toAbsolutePath().normalize()); } /// Loads dataset metadata from the specified file. diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java index 5ae1cf2e6..6d017d899 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java @@ -150,7 +150,7 @@ default boolean isValid() { /// )); /// /// // From a YAML file, selecting a named entry - /// var props = new DataSetProperties.PropertyMap("dataset_metadata.yml", "ada002-100k"); + /// var props = new DataSetProperties.PropertyMap("dataset-metadata.yml", "ada002-100k"); /// /// // From a flat YAML file (no top-level key) /// var props = new DataSetProperties.PropertyMap("my_dataset.yml", null); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java index 449ff4fc6..94b2e1c77 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java +++ 
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java @@ -37,8 +37,15 @@ public class DataSets { private static final Logger logger = LoggerFactory.getLogger(DataSets.class); public static final List defaultLoaders = new ArrayList<>() {{ - add(new DataSetLoaderHDF5()); - add(new DataSetLoaderMFD()); + + /// Scans the jvector-examples/yaml-configs/dataset-catalogs/ directory for .yaml/.yml files. + /// + /// To add your own datasets: + /// 1. Add a .yaml file with your dataset mappings (see local-catalog.yaml for examples) + /// 2. For private remote datasets, use baseurl with ${SECRET_HASH} style env vars + /// + add(new DataSetLoaderSimpleMFD("jvector-examples/yaml-configs/dataset-catalogs")); + }}; /// Loads a dataset by name using the {@link #defaultLoaders}. diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java index 77e180f4f..5cb2ebcde 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java @@ -17,7 +17,6 @@ package io.github.jbellis.jvector.example.reporting; import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet; -import io.github.jbellis.jvector.example.benchmarks.datasets.DataSetLoaderMFD; import io.github.jbellis.jvector.example.benchmarks.Metric; import io.github.jbellis.jvector.example.yaml.MultiConfig; import io.github.jbellis.jvector.example.yaml.MetricSelection; @@ -39,7 +38,7 @@ * - sys_info.json (RunReporting) * - dataset_info.csv (DatasetInfoWriter) * - experiments.csv (ExperimentsCsvWriter) - * - run-level compute/display/log selections from run.yml + * - run-level compute/display/log selections from run-config.yml */ public final class RunArtifacts { @@ -242,17 +241,6 @@ public void 
registerDataset(String datasetName, DataSet ds) throws IOException { return; // disabled } - var mfd = DataSetLoaderMFD.MultiFileDatasource.byName.get(datasetName); - - String basePath = ""; - String queryPath = ""; - String gtPath = ""; - if (mfd != null) { - basePath = Paths.get("fvec").resolve(mfd.basePath).toAbsolutePath().toString(); - queryPath = Paths.get("fvec").resolve(mfd.queriesPath).toAbsolutePath().toString(); - gtPath = Paths.get("fvec").resolve(mfd.groundTruthPath).toAbsolutePath().toString(); - } - - datasetInfoWriter.register(DatasetInfoWriter.fromDataSet(datasetName, basePath, queryPath, gtPath, ds)); + datasetInfoWriter.register(DatasetInfoWriter.fromDataSet(datasetName, "", "", "", ds)); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunReporting.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunReporting.java index 5d6dca98c..9cb21e24a 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunReporting.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunReporting.java @@ -29,7 +29,7 @@ /** * Bootstraps a benchmark run directory and writes sys_info.json. * - * This class creates a run_id/run_uuid, selects the logging directory from run.yml, captures basic + * This class creates a run_id/run_uuid, selects the logging directory from run-config.yml, captures basic * environment metadata (OS/JVM/CPU/SIMD/threads/memory), computes a stable system_id, and returns a * {@link RunContext} for downstream writers (dataset_info.csv, experiments.csv). 
*/ diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/SearchSelection.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/SearchSelection.java index fbe3e96c4..814c862b2 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/SearchSelection.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/SearchSelection.java @@ -26,7 +26,7 @@ * Encapsulates selection + resolution + warning + application for a single sink (console/logging) * in the search phase, using {@link ReportingSelectionResolver} and {@link SearchReportingCatalog}. * - * Selections are typically run-level (from run.yml via {@link io.github.jbellis.jvector.example.yaml.RunConfig}). + * Selections are typically run-level (from run-config.yml via {@link io.github.jbellis.jvector.example.yaml.RunConfig}). * * This prevents call-site ordering mistakes (validate -> resolve -> warn -> apply). */ diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java index 0f2b79805..6e56a6e26 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java @@ -37,7 +37,7 @@ public class MultiConfig { public SearchParameters search; public String dataset; - private static final String defaultDirectory = "jvector-examples/yaml-configs/"; + private static final String defaultDirectory = "jvector-examples/yaml-configs/index-parameters/"; private static final java.util.regex.Pattern YAML_SCHEMA_VERSION_KEY = java.util.regex.Pattern.compile("(?m)^\\s*yamlSchemaVersion\\s*:"); @@ -127,7 +127,7 @@ static MultiConfig getConfig(File configFile) throws FileNotFoundException { // Legacy yamlSchemaVersion "0": lenient parsing (ignore unknown fields like 
search.benchmarks) if (WARNED_LEGACY.compareAndSet(false, true)) { System.err.println("WARNING: Deprecated legacy YAML schema detected (no yamlSchemaVersion). " - + "Unknown fields will be ignored. Please migrate configs to yamlSchemaVersion: 1 and run.yml."); + + "Unknown fields will be ignored. Please migrate configs to yamlSchemaVersion: 1 and run-config.yml."); } try { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/RunConfig.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/RunConfig.java index b65dda082..8711007aa 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/RunConfig.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/RunConfig.java @@ -26,7 +26,7 @@ import java.util.Map; /** - * Run-level configuration loaded from yaml-configs/run.yml. + * Run-level configuration loaded from yaml-configs/run-config.yml. * * This controls: * - benchmark computation (benchmarks) @@ -36,7 +36,7 @@ */ public class RunConfig { private static final String defaultDirectory = "jvector-examples/yaml-configs/"; - private static final String defaultRunFile = "run.yml"; + private static final String defaultRunFile = "run-config.yml"; public int yamlSchemaVersion; public int onDiskIndexVersion; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java index e636a3ce9..dabce268a 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java @@ -23,6 +23,6 @@ public class SearchParameters extends CommonParameters { public Map> topKOverquery; public List useSearchPruning; - // NOTE: benchmark compute + console/logging selection are now run-level (run.yml) + // NOTE: benchmark compute + 
console/logging selection are now run-level (run-config.yml) // and are no longer recognized in dataset configs. } diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFDTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFDTest.java new file mode 100644 index 000000000..33379dd57 --- /dev/null +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFDTest.java @@ -0,0 +1,1514 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.jbellis.jvector.example.benchmarks.datasets; + +import com.sun.net.httpserver.HttpExchange; +import com.sun.net.httpserver.HttpServer; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.IOException; +import java.io.OutputStream; +import java.net.InetSocketAddress; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.*; + +/// Tests for {@link DataSetLoaderSimpleMFD} using local files only, with no remote endpoint. 
+public class DataSetLoaderSimpleMFDTest { + + @Rule + public TemporaryFolder tempFolder = new TemporaryFolder(); + + private Path cacheDir; + private DataSetMetadataReader testMetadata; + + /// Returns the name of an environment variable that is reliably set on all platforms. + /// On Unix this is typically HOME; on Windows it is typically USERPROFILE or PATH. + private static String findReliableEnvVar() { + for (String name : new String[] {"HOME", "USERPROFILE", "PATH"}) { + if (System.getenv(name) != null) return name; + } + throw new AssertionError("Could not find any set environment variable for testing"); + } + + @Before + public void setUp() throws IOException { + cacheDir = tempFolder.newFolder("datasets").toPath(); + + // create a test-only metadata file + Path metadataFile = tempFolder.newFile("test_metadata.yml").toPath(); + Files.writeString(metadataFile, + "test-ds:\n" + + " similarity_function: COSINE\n" + + " load_behavior: NO_SCRUB\n" + + "sub-ds:\n" + + " similarity_function: COSINE\n" + + " load_behavior: NO_SCRUB\n" + + "private-ds:\n" + + " similarity_function: DOT_PRODUCT\n" + + " load_behavior: NO_SCRUB\n"); + testMetadata = DataSetMetadataReader.load(metadataFile.toString()); + } + + // ======================================================================== + // Basic loading + // ======================================================================== + + @Test + public void loadsDatasetFromLocalCatalogAndFiles() throws IOException { + writeTestCatalog(cacheDir); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var info = loader.loadDataSet("test-ds"); + assertTrue(info.isPresent(), "Dataset should be found in local catalog"); + assertEquals("test-ds", info.get().getName()); + + var ds = info.get().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + assertEquals(2, ds.getQueryVectors().size()); + assertEquals(2, ds.getGroundTruth().size()); + 
assertEquals(4, ds.getDimension()); + } + + @Test + public void returnsEmptyForUnknownDataset() throws IOException { + writeTestCatalog(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertFalse(loader.loadDataSet("nonexistent-dataset").isPresent()); + } + + @Test + public void failsWhenLocalFilesMissing() throws IOException { + writeTestCatalog(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ex = assertThrows(RuntimeException.class, () -> loader.loadDataSet("test-ds")); + assertTrue(ex.getCause().getMessage().contains("no remote URL configured"), + "Error should indicate no remote is available: " + ex.getCause().getMessage()); + } + + @Test + public void failsWhenNoCatalogAndRemoteUnreachable() { + assertThrows(RuntimeException.class, () -> new DataSetLoaderSimpleMFD( + "http://0.0.0.0:1/catalog_entries.yaml", + cacheDir.toString(), false, testMetadata + )); + } + + @Test + public void checkForUpdatesDoesNotFailWhenRemoteUnreachable() throws IOException { + writeTestCatalog(cacheDir); + writeTestDataFiles(cacheDir); + + // should not throw — logs a warning but proceeds with the local catalog + var loader = new DataSetLoaderSimpleMFD( + "http://0.0.0.0:1/catalog_entries.yaml", + cacheDir.toString(), true, testMetadata + ); + + assertTrue(loader.loadDataSet("test-ds").isPresent()); + } + + @Test + public void rejectsCatalogWithMissingFields() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "bad-ds:\n base: b.fvecs\n query: q.fvecs\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertFalse(loader.loadDataSet("bad-ds").isPresent(), + "Should return empty for dataset with missing catalog fields"); + } + + @Test + public void unknownFieldThrows() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + 
"test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n" + + " similarity: COSINE\n"); + + var ex = assertThrows(IllegalArgumentException.class, () -> + new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata) + ); + assertTrue(ex.getMessage().contains("similarity"), + "Error should name the unknown field: " + ex.getMessage()); + } + + @Test + public void unknownFieldInDefaultsThrows() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_defaults:\n" + + " typo_field: some_value\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + + var ex = assertThrows(IllegalArgumentException.class, () -> + new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata) + ); + assertTrue(ex.getMessage().contains("typo_field"), + "Error should name the unknown field: " + ex.getMessage()); + } + + // ======================================================================== + // Local path resolution (file vs directory) + // ======================================================================== + + @Test + public void loadsWithLocalPathAsYamlFile() throws IOException { + writeTestCatalog(cacheDir); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.resolve("catalog_entries.yaml").toString(), + false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void singleArgConstructorWithFilePath() throws IOException { + writeTestCatalog(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + cacheDir.resolve("catalog_entries.yaml").toString() + ); + + // can't call getDataSet() (no test-ds in production metadata), but catalog is loaded + assertFalse(loader.loadDataSet("nonexistent").isPresent()); + } + + @Test + public void singleArgConstructorWithDirectory() throws 
IOException { + writeTestCatalog(cacheDir); + + var loader = new DataSetLoaderSimpleMFD(cacheDir.toString()); + assertFalse(loader.loadDataSet("nonexistent").isPresent()); + } + + @Test + public void singleArgConstructorMissingCatalogReturnsEmpty() { + var loader = new DataSetLoaderSimpleMFD(cacheDir.toString()); + assertFalse(loader.loadDataSet("test-ds").isPresent()); + } + + // ======================================================================== + // Null / empty catalog URL (local-only mode) + // ======================================================================== + + @Test + public void nullCatalogUrlWorksWithLocalCatalog() throws IOException { + writeTestCatalog(cacheDir); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void emptyCatalogUrlWorksWithLocalCatalog() throws IOException { + writeTestCatalog(cacheDir); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + "", cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void nullCatalogUrlWithoutLocalCatalogReturnsEmpty() { + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + assertFalse(loader.loadDataSet("test-ds").isPresent()); + } + + @Test + public void nullCatalogUrlIgnoresCheckForUpdates() throws IOException { + writeTestCatalog(cacheDir); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), true, testMetadata + ); + assertTrue(loader.loadDataSet("test-ds").isPresent()); + } + + // ======================================================================== + // catalogUrl remote catalog loading + // 
======================================================================== + + @Test + public void catalogUrlFetchesRemoteCatalogWhenNoLocalCatalogExists() throws IOException { + Path remoteDir = tempFolder.newFolder("remote-catalog").toPath(); + Path remoteCacheDir = tempFolder.newFolder("remote-cache").toPath(); + + Files.writeString(remoteDir.resolve("catalog_entries.yaml"), + "_defaults:\n" + + " cache_dir: " + remoteCacheDir + "\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(remoteDir); + + assertFalse(Files.exists(cacheDir.resolve("catalog_entries.yaml")), + "Precondition: local catalog should not exist"); + + HttpServer server = startFileServer(remoteDir); + try { + var loader = new DataSetLoaderSimpleMFD( + urlFor(server, "catalog_entries.yaml"), + cacheDir.toString(), false, testMetadata + ); + + // remote catalog should be cached locally + assertTrue(Files.exists(cacheDir.resolve("catalog_entries.yaml"))); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + assertEquals(2, ds.getQueryVectors().size()); + assertEquals(2, ds.getGroundTruth().size()); + assertEquals(4, ds.getDimension()); + + // dataset files should be downloaded using the remote catalog's base path + assertTrue(Files.exists(remoteCacheDir.resolve("test_base.fvecs"))); + assertTrue(Files.exists(remoteCacheDir.resolve("test_query.fvecs"))); + assertTrue(Files.exists(remoteCacheDir.resolve("test_gt.ivecs"))); + } finally { + server.stop(0); + } + } + + @Test + public void catalogUrlDoesNotMergeRemoteCatalogWhenLocalCatalogExists() throws IOException { + // local catalog should win; remote catalog is only used for update checks in this mode + writeTestCatalog(cacheDir); + writeTestDataFiles(cacheDir); + + Path remoteDir = tempFolder.newFolder("remote-catalog").toPath(); + Files.writeString(remoteDir.resolve("catalog_entries.yaml"), + 
"sub-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(remoteDir); + + HttpServer server = startFileServer(remoteDir); + try { + var loader = new DataSetLoaderSimpleMFD( + urlFor(server, "catalog_entries.yaml"), + cacheDir.toString(), false, testMetadata + ); + + assertTrue(loader.loadDataSet("test-ds").isPresent(), + "Local catalog entry should be found"); + assertFalse(loader.loadDataSet("sub-ds").isPresent(), + "Remote catalog should not be merged when a local catalog exists"); + } finally { + server.stop(0); + } + } + + // ======================================================================== + // Comment-only / empty catalog files + // ======================================================================== + + @Test + public void commentOnlyCatalogFileReturnsEmpty() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "# This file has no actual entries\n# Just comments\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + assertFalse(loader.loadDataSet("test-ds").isPresent()); + } + + @Test + public void emptyCatalogFileReturnsEmpty() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), ""); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + assertFalse(loader.loadDataSet("test-ds").isPresent()); + } + + @Test + public void singleArgWithCommentOnlyCatalogReturnsEmpty() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "# placeholder for local datasets\n"); + + var loader = new DataSetLoaderSimpleMFD( + cacheDir.resolve("catalog_entries.yaml").toString() + ); + assertFalse(loader.loadDataSet("anything").isPresent()); + } + + // ======================================================================== + // Recursive catalog discovery + // 
======================================================================== + + @Test + public void recursivelyDiscoversCatalogs() throws IOException { + // root catalog + writeTestCatalog(cacheDir); + writeTestDataFiles(cacheDir); + + // subdirectory catalog with a different dataset + Path subDir = cacheDir.resolve("subgroup"); + Files.createDirectories(subDir); + Files.writeString(subDir.resolve("catalog_entries.yaml"), + "sub-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(subDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + // both datasets should be found + assertTrue(loader.loadDataSet("test-ds").isPresent(), "Root catalog entry should be found"); + assertTrue(loader.loadDataSet("sub-ds").isPresent(), "Subdirectory catalog entry should be found"); + } + + @Test + public void subdirectoryDataFilesResolveRelativeToTheirCatalog() throws IOException { + // root has no data files, subdirectory has both catalog and data + Path subDir = cacheDir.resolve("subgroup"); + Files.createDirectories(subDir); + Files.writeString(subDir.resolve("catalog_entries.yaml"), + "sub-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(subDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + // data files should resolve relative to subDir, not cacheDir + var ds = loader.loadDataSet("sub-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void duplicateEntryAcrossCatalogsDoesNotFail() throws IOException { + // root catalog defines test-ds + writeTestCatalog(cacheDir); + writeTestDataFiles(cacheDir); + + // subdirectory also defines test-ds — one wins (walk order is unspecified) + Path subDir = cacheDir.resolve("override"); + Files.createDirectories(subDir); + 
Files.writeString(subDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(subDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + // should load without error — whichever catalog wins, the dataset is valid + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertNotNull(ds); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void discoversAlternativeCatalogFilenames() throws IOException { + // entries.yaml in root + Files.writeString(cacheDir.resolve("entries.yaml"), + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + // private_entries.yaml in subdirectory + Path subDir = cacheDir.resolve("private"); + Files.createDirectories(subDir); + Files.writeString(subDir.resolve("private_entries.yaml"), + "sub-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(subDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertTrue(loader.loadDataSet("test-ds").isPresent(), "root yaml should be discovered"); + assertTrue(loader.loadDataSet("sub-ds").isPresent(), "subdirectory yaml should be discovered"); + } + + @Test + public void ignoresNonYamlFiles() throws IOException { + // .json and .txt files should not be picked up + Files.writeString(cacheDir.resolve("datasets.json"), + "{\"test-ds\": {\"base\": \"test_base.fvecs\"}}"); + Files.writeString(cacheDir.resolve("readme.txt"), + "test-ds:\n base: test_base.fvecs\n query: test_query.fvecs\n gt: test_gt.ivecs\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertFalse(loader.loadDataSet("test-ds").isPresent(), + "Non-YAML files should be ignored"); + } 
+ + @Test + public void anyYamlFileIsDiscovered() throws IOException { + // any .yaml file should be picked up, not just *entries.yaml + Files.writeString(cacheDir.resolve("my_datasets.yaml"), + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertTrue(loader.loadDataSet("test-ds").isPresent(), + "Any .yaml file should be discovered"); + } + + @Test + public void ymlExtensionAlsoDiscovered() throws IOException { + Files.writeString(cacheDir.resolve("datasets.yml"), + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertTrue(loader.loadDataSet("test-ds").isPresent(), + ".yml files should also be discovered"); + } + + // ======================================================================== + // Per-entry base_url override + // ======================================================================== + + @Test + public void base_urlOverrideIsUsedForDownload() throws IOException { + // catalog entry has a base_url pointing to an unreachable server + // but the files exist locally — so base_url is not actually hit. + // This test verifies the entry is parsed correctly and local files still resolve. 
+ Path subDir = cacheDir.resolve("private"); + Files.createDirectories(subDir); + Files.writeString(subDir.resolve("catalog_entries.yaml"), + "private-ds:\n" + + " base_url: http://0.0.0.0:1/secret-hash/\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(subDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("private-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void base_urlOverrideFailsWhenLocalFilesMissingAndRemoteUnreachable() throws IOException { + // catalog entry has a base_url pointing to unreachable server, and no local data files + Path subDir = cacheDir.resolve("private"); + Files.createDirectories(subDir); + Files.writeString(subDir.resolve("catalog_entries.yaml"), + "private-ds:\n" + + " base_url: http://0.0.0.0:1/secret-hash/\n" + + " base: missing_base.fvecs\n" + + " query: missing_query.fvecs\n" + + " gt: missing_gt.ivecs\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + // should fail because files don't exist and base_url is unreachable + assertThrows(RuntimeException.class, () -> loader.loadDataSet("private-ds")); + } + + @Test + public void base_urlWithoutTrailingSlashIsNormalized() throws IOException { + Path subDir = cacheDir.resolve("private"); + Files.createDirectories(subDir); + // base_url without trailing slash + Files.writeString(subDir.resolve("catalog_entries.yaml"), + "private-ds:\n" + + " base_url: http://0.0.0.0:1/no-trailing-slash\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(subDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + // should load fine — base_url is normalized with trailing slash + var ds = 
loader.loadDataSet("private-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void subdirectoryPathsInFileValuesResolveCorrectly() throws IOException { + // mirrors the real large_dataset_entries.yaml structure where file values + // contain subdirectory paths like "dpr/c4-en_base_1M_norm.fvecs" + Path privateDir = cacheDir.resolve("jvector_private"); + Files.createDirectories(privateDir); + Files.writeString(privateDir.resolve("large_dataset_entries.yaml"), + "test-ds:\n" + + " base_url: http://0.0.0.0:1/secret-hash/\n" + + " base: subdir/test_base.fvecs\n" + + " query: subdir/test_query.fvecs\n" + + " gt: subdir/test_gt.ivecs\n"); + + // create data files in the subdirectory under the catalog's directory + Path dataDir = privateDir.resolve("subdir"); + Files.createDirectories(dataDir); + writeTestDataFiles(dataDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + assertEquals(4, ds.getDimension()); + } + + @Test + public void subdirectoryPathsDownloadWhenLocalMissing() throws IOException { + // catalog has subdirectory paths but files are missing locally and remote is unreachable + // — should fail with a clear error mentioning the base_url, not the default remote + Path privateDir = cacheDir.resolve("jvector_private"); + Files.createDirectories(privateDir); + Files.writeString(privateDir.resolve("large_dataset_entries.yaml"), + "test-ds:\n" + + " base_url: http://0.0.0.0:1/secret-hash/\n" + + " base: subdir/missing_base.fvecs\n" + + " query: subdir/missing_query.fvecs\n" + + " gt: subdir/missing_gt.ivecs\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + // should attempt to download from the base_url and fail + assertThrows(RuntimeException.class, () -> loader.loadDataSet("test-ds")); 
+ } + + // ======================================================================== + // _defaults and _-prefix exclusion + // ======================================================================== + + @Test + public void defaultsAreFoldedIntoEntries() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_defaults:\n" + + " base_url: http://0.0.0.0:1/default-path/\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + // files exist locally so base_url isn't hit, but the entry should load fine + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void entryOverridesDefaults() throws IOException { + // _defaults sets base_url, but the entry overrides it + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_defaults:\n" + + " base_url: http://0.0.0.0:1/should-be-overridden/\n" + + "test-ds:\n" + + " base_url: http://0.0.0.0:2/entry-specific/\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void underscorePrefixedKeysAreExcluded() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_defaults:\n" + + " base_url: http://0.0.0.0:1/x/\n" + + "_internal:\n" + + " base: should_not_appear.fvecs\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), 
false, testMetadata + ); + + assertFalse(loader.loadDataSet("_defaults").isPresent(), "_defaults should not be a dataset"); + assertFalse(loader.loadDataSet("_internal").isPresent(), "_internal should not be a dataset"); + assertTrue(loader.loadDataSet("test-ds").isPresent(), "test-ds should be found"); + } + + // ======================================================================== + // cache_dir + // ======================================================================== + + @Test + public void cacheDirOverridesLocalDir() throws IOException { + // catalog is in cacheDir, but cache_dir points to a separate location + Path customCache = tempFolder.newFolder("custom-cache").toPath(); + writeTestDataFiles(customCache); + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " cache_dir: " + customCache.toString() + "\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + // note: NO data files in cacheDir — they're in customCache + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void cacheDirFromDefaults() throws IOException { + Path customCache = tempFolder.newFolder("default-cache").toPath(); + writeTestDataFiles(customCache); + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_defaults:\n" + + " cache_dir: " + customCache.toString() + "\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void cacheDirEntryOverridesDefault() throws IOException { + Path defaultCache = 
tempFolder.newFolder("default-cache").toPath(); + Path entryCache = tempFolder.newFolder("entry-cache").toPath(); + writeTestDataFiles(entryCache); + // note: defaultCache has NO data files + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_defaults:\n" + + " cache_dir: " + defaultCache.toString() + "\n" + + "test-ds:\n" + + " cache_dir: " + entryCache.toString() + "\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void nonExistentCacheDirIsAutoCreatedOnDownloadAttempt() throws IOException { + // cache_dir points to a directory that doesn't exist yet + Path newCacheDir = cacheDir.resolve("auto-created-subdir"); + assertFalse(Files.exists(newCacheDir), "Precondition: dir should not exist yet"); + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " cache_dir: " + newCacheDir + "\n" + + " base_url: http://0.0.0.0:1/unreachable/\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + // download will fail (unreachable), but the directory should have been created + assertThrows(RuntimeException.class, () -> loader.loadDataSet("test-ds")); + assertTrue(Files.isDirectory(newCacheDir), + "cache_dir should be auto-created before download is attempted"); + } + + @Test + public void nonExistentCacheDirWithSubpathIsAutoCreated() throws IOException { + // cache_dir doesn't exist, and filenames contain subdirectories + Path newCacheDir = cacheDir.resolve("deep/nested/cache"); + assertFalse(Files.exists(newCacheDir)); + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " 
cache_dir: " + newCacheDir + "\n" + + " base_url: http://0.0.0.0:1/unreachable/\n" + + " base: subdir/test_base.fvecs\n" + + " query: subdir/test_query.fvecs\n" + + " gt: subdir/test_gt.ivecs\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertThrows(RuntimeException.class, () -> loader.loadDataSet("test-ds")); + // both cache_dir and the subdir should be created + assertTrue(Files.isDirectory(newCacheDir.resolve("subdir")), + "cache_dir and subdirectory should be auto-created"); + } + + @Test + public void nonExistentCacheDirWithLocalFilesPrePopulated() throws IOException { + // cache_dir is auto-created, and files are placed there before loading + Path newCacheDir = cacheDir.resolve("fresh-cache"); + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " cache_dir: " + newCacheDir + "\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + + // pre-create and populate — simulates a previous download + Files.createDirectories(newCacheDir); + writeTestDataFiles(newCacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + // ======================================================================== + // ${VAR} expansion + // ======================================================================== + + @Test + public void envVarExpandedInBaseurl() throws IOException { + String envName = findReliableEnvVar(); + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " base_url: http://0.0.0.0:1/${" + envName + "}/path/\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); 
+ + // files exist locally so the expanded base_url isn't hit, but parsing should succeed + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void envVarExpandedInCacheDir() throws IOException { + Path customCache = tempFolder.newFolder("env-cache").toPath(); + writeTestDataFiles(customCache); + + // verify that the ${} syntax is expanded without error + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " cache_dir: " + customCache.toString() + "\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void envVarExpandedInDefaults() throws IOException { + String envName = findReliableEnvVar(); + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_defaults:\n" + + " base_url: http://0.0.0.0:1/${" + envName + "}/\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void undefinedEnvVarThrows() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " base_url: s3://bucket/${JVECTOR_NONEXISTENT_VAR_12345}/\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + + var ex = assertThrows(IllegalArgumentException.class, () -> + new DataSetLoaderSimpleMFD(null, cacheDir.toString(), false, testMetadata) + ); + 
assertTrue(ex.getMessage().contains("JVECTOR_NONEXISTENT_VAR_12345"), + "Error should name the missing variable: " + ex.getMessage()); + } + + @Test + public void envVarWithDefaultUsesDefault() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " base_url: http://0.0.0.0:1/${JVECTOR_NONEXISTENT_12345:-fallback-path}/\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void envVarWithDefaultPrefersEnvWhenSet() throws IOException { + String envName = findReliableEnvVar(); + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " base_url: http://0.0.0.0:1/${" + envName + ":-not-used}/\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + assertTrue(loader.loadDataSet("test-ds").isPresent()); + } + + @Test + public void envVarWithEmptyDefault() throws IOException { + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " base_url: http://0.0.0.0:1/${JVECTOR_NONEXISTENT_12345:-}/data/\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + assertTrue(loader.loadDataSet("test-ds").isPresent()); + } + + @Test + public void multipleEnvVarsExpanded() throws IOException { + String envName = findReliableEnvVar(); + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " base_url: 
http://0.0.0.0:1/${" + envName + "}/${" + envName + "}/\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + assertTrue(loader.loadDataSet("test-ds").isPresent()); + } + + // ======================================================================== + // _include directive + // ======================================================================== + + @Test + public void includeWithUnreachableRemoteWarnsButDoesNotFail() throws IOException { + // _include points to an unreachable URL — should log a warning, not crash + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_include:\n" + + " url: http://0.0.0.0:1/nonexistent/catalog_entries.yaml\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + // local entry should still work + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void includeWithUnreachableRemoteAndNoLocalEntriesReturnsEmpty() throws IOException { + // _include only, no local entries, remote unreachable — empty catalog, no crash + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_include:\n" + + " url: http://0.0.0.0:1/nonexistent/catalog_entries.yaml\n"); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertFalse(loader.loadDataSet("anything").isPresent()); + } + + @Test + public void includeWithMissingUrlFieldIsIgnored() throws IOException { + // _include exists but has no url field — should be silently ignored + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_include:\n" + + " description: this has no 
url\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void localEntryOverridesIncludedEntry() throws IOException { + // simulate: _include would bring in "test-ds" from remote, but local also defines it. + // Since _include fails (unreachable), only the local entry exists. This tests that + // local entries in the same file are processed after _include and thus override. + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_include:\n" + + " url: http://0.0.0.0:1/remote/catalog_entries.yaml\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + // local entry should work — the failed include shouldn't prevent it + var ds = loader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, ds.getBaseVectors().size()); + } + + @Test + public void includeDefaultsAppliedToIncludedEntries() throws IOException { + // _defaults in the local file should be applied to entries from _include. + // Since we can't hit a real remote in unit tests, we verify indirectly: + // the _defaults + _include combo should not crash even with unreachable remote. 
+ Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_defaults:\n" + + " cache_dir: " + cacheDir.toString() + "\n" + + "_include:\n" + + " url: http://0.0.0.0:1/remote/catalog_entries.yaml\n"); + + // should not throw + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertFalse(loader.loadDataSet("anything").isPresent()); + } + + @Test + public void includeWithEnvVarInUrl() throws IOException { + String envName = findReliableEnvVar(); + + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_include:\n" + + " url: http://0.0.0.0:1/${" + envName + "}/catalog_entries.yaml\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertTrue(loader.loadDataSet("test-ds").isPresent()); + } + + @Test + public void includeWithDefaultValueInUrl() throws IOException { + // ${NONEXISTENT:-fallback} in _include url + Files.writeString(cacheDir.resolve("catalog_entries.yaml"), + "_include:\n" + + " url: http://0.0.0.0:1/${JVECTOR_NONEXISTENT_12345:-fallback}/catalog_entries.yaml\n" + + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(cacheDir); + + // should not throw — the default value is used + var loader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertTrue(loader.loadDataSet("test-ds").isPresent()); + } + + // ======================================================================== + // Log redaction + // ======================================================================== + + @Test + public void redactReplacesLongHexHash() { + // SHA-256 hash (64 hex chars) + String url = "s3://bucket/6174752eb60168112f2edb38493782da2ebe5ae6bfc870e25ed1711205e5395d/dpr/file.fvecs"; + String redacted = 
DataSetLoaderSimpleMFD.redact(url); + assertFalse(redacted.contains("6174752e"), "Hash should be redacted: " + redacted); + assertTrue(redacted.contains("[[redacted]]"), "Should contain redaction marker: " + redacted); + assertTrue(redacted.contains("s3://bucket/"), "Bucket should be preserved: " + redacted); + assertTrue(redacted.contains("/dpr/file.fvecs"), "Non-hash path should be preserved: " + redacted); + } + + @Test + public void redactHandlesHashWithSuffix() { + // hash_private pattern + String url = "s3://bucket/6174752eb60168112f2edb38493782da2ebe5ae6bfc870e25ed1711205e5395d_private/data.fvecs"; + String redacted = DataSetLoaderSimpleMFD.redact(url); + assertFalse(redacted.contains("6174752e"), "Hash should be redacted: " + redacted); + assertTrue(redacted.contains("[[redacted]]")); + } + + @Test + public void redactHandlesHashWithDashes() { + // UUID-style or dashed key: a3f8b2c1-d4e5-f6a7-b8c9-d0e1f2a3b4c5 + String url = "s3://bucket/a3f8b2c1-d4e5-f6a7-b8c9-d0e1f2a3b4c5/data.fvecs"; + String redacted = DataSetLoaderSimpleMFD.redact(url); + assertFalse(redacted.contains("a3f8b2c1"), "Dashed hex should be redacted: " + redacted); + assertTrue(redacted.contains("[[redacted]]")); + } + + @Test + public void redactHandlesHashWithDots() { + // dotted hex token + String url = "https://host/a3f8b2c1.d4e5f6a7.b8c9d0e1.f2a3b4c5/data.fvecs"; + String redacted = DataSetLoaderSimpleMFD.redact(url); + assertFalse(redacted.contains("a3f8b2c1"), "Dotted hex should be redacted: " + redacted); + } + + @Test + public void redactHandles0xPrefix() { + String url = "s3://bucket/0xa3f8b2c1d4e5f6a7b8c9d0e1f2a3b4c5/data.fvecs"; + String redacted = DataSetLoaderSimpleMFD.redact(url); + assertFalse(redacted.contains("a3f8b2c1"), "0x-prefixed hex should be redacted: " + redacted); + assertTrue(redacted.contains("[[redacted]]")); + } + + @Test + public void redactPreservesNonHashPaths() { + String url = "s3://jvector-datasets-public/datasets-clean/ada_002_100k_base.fvecs"; + 
assertEquals(url, DataSetLoaderSimpleMFD.redact(url), "No hash segments — should be unchanged"); + } + + @Test + public void redactPreservesDatasetNames() { + // these have some hex-like chars but are clearly not secrets + assertEquals("/data/e5-base-v2-100k/file.fvecs", + DataSetLoaderSimpleMFD.redact("/data/e5-base-v2-100k/file.fvecs")); + assertEquals("/data/ada002-100k/file.fvecs", + DataSetLoaderSimpleMFD.redact("/data/ada002-100k/file.fvecs")); + assertEquals("s3://bucket/cohere-english-v3-1M/file.fvecs", + DataSetLoaderSimpleMFD.redact("s3://bucket/cohere-english-v3-1M/file.fvecs")); + } + + @Test + public void redactPreservesShortHexSegments() { + String path = "/data/a3f8b2/file.fvecs"; + assertEquals(path, DataSetLoaderSimpleMFD.redact(path)); + } + + @Test + public void redactHandlesNull() { + assertEquals("null", DataSetLoaderSimpleMFD.redact(null)); + } + + @Test + public void redactHandlesMultipleSecretSegments() { + String url = "s3://bucket/aaaa1111bbbb2222cccc3333dddd4444eeee5555ffff6666/sub/1111222233334444555566667777888899990000aaaabbbb/file.fvecs"; + String redacted = DataSetLoaderSimpleMFD.redact(url); + assertFalse(redacted.contains("aaaa1111"), "First hash should be redacted"); + assertFalse(redacted.contains("11112222"), "Second hash should be redacted"); + assertTrue(redacted.contains("/sub/"), "Non-hash path preserved"); + } + + @Test + public void redactHandlesWindowsPaths() { + String path = "C:\\data\\6174752eb60168112f2edb38493782da2ebe5ae6bfc870e25ed1711205e5395d\\file.fvecs"; + String redacted = DataSetLoaderSimpleMFD.redact(path); + assertFalse(redacted.contains("6174752e"), "Hash should be redacted in Windows paths: " + redacted); + } + + @Test + public void redactHandlesMixedSeparatorsInSecret() { + // underscores and dashes mixed with hex — still predominantly hex + String url = "s3://bucket/a1b2c3d4_e5f6a7b8-c9d0e1f2_a3b4c5d6/data.fvecs"; + String redacted = DataSetLoaderSimpleMFD.redact(url); + 
assertFalse(redacted.contains("a1b2c3d4"), "Mixed-separator hex should be redacted: " + redacted); + } + + // ======================================================================== + // _include cached remote catalogs + // ======================================================================== + + @Test + public void includeOnlyCatalogLoadsOfflineFromCachedRemoteCatalog() throws IOException { + // wrapper catalog points to a remote catalog and caches data files locally + Path remoteDir = tempFolder.newFolder("remote-catalog").toPath(); + Path cachedDataDir = tempFolder.newFolder("cached-public-data").toPath(); + + Files.writeString(remoteDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(remoteDir); + + HttpServer server = startFileServer(remoteDir); + try { + Files.writeString(cacheDir.resolve("public-catalog.yaml"), + "_include:\n" + + " url: " + urlFor(server, "catalog_entries.yaml") + "\n" + + "_defaults:\n" + + " cache_dir: " + cachedDataDir + "\n"); + + // first run online: include fetch succeeds and data files are cached locally + var onlineLoader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var onlineDs = onlineLoader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(5, onlineDs.getBaseVectors().size()); + assertTrue(Files.exists(cachedDataDir.resolve("test_base.fvecs"))); + assertTrue(Files.exists(cachedDataDir.resolve("test_query.fvecs"))); + assertTrue(Files.exists(cachedDataDir.resolve("test_gt.ivecs"))); + } finally { + server.stop(0); + } + + // second run offline: include fetch fails, but the cached remote catalog still + // provides the dataset entry so the cached data files can be loaded + var offlineLoader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var offlineDs = offlineLoader.loadDataSet("test-ds").orElseThrow().getDataSet(); + 
assertEquals(5, offlineDs.getBaseVectors().size()); + assertEquals(2, offlineDs.getQueryVectors().size()); + assertEquals(2, offlineDs.getGroundTruth().size()); + assertEquals(4, offlineDs.getDimension()); + } + + @Test + public void localCatalogOverridesCachedIncludedRemoteCatalogOffline() throws IOException { + // local dataset should win over a cached included remote dataset of the same name + Path remoteDir = tempFolder.newFolder("remote-catalog").toPath(); + Path cachedRemoteDir = tempFolder.newFolder("cached-public-data").toPath(); + Path localOverrideDir = tempFolder.newFolder("local-override").toPath(); + + Files.writeString(remoteDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(remoteDir); + + Files.writeString(cacheDir.resolve("local-override.yaml"), + "test-ds:\n" + + " cache_dir: " + localOverrideDir + "\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeLocalOverrideDataFiles(localOverrideDir); + + HttpServer server = startFileServer(remoteDir); + try { + Files.writeString(cacheDir.resolve("public-catalog.yaml"), + "_include:\n" + + " url: " + urlFor(server, "catalog_entries.yaml") + "\n" + + "_defaults:\n" + + " cache_dir: " + cachedRemoteDir + "\n"); + + // online construction fetches and caches the included remote catalog, + // but the local override should still win + var onlineLoader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var onlineDs = onlineLoader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(1, onlineDs.getBaseVectors().size()); + } finally { + server.stop(0); + } + + // offline, the cached remote catalog should still not override the real local dataset + var offlineLoader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + var offlineDs = 
offlineLoader.loadDataSet("test-ds").orElseThrow().getDataSet(); + assertEquals(1, offlineDs.getBaseVectors().size()); + assertEquals(1, offlineDs.getQueryVectors().size()); + assertEquals(1, offlineDs.getGroundTruth().size()); + assertEquals(4, offlineDs.getDimension()); + } + + @Test + public void cachedIncludedRemoteCatalogStillFailsOfflineWhenDataFilesAreMissing() throws IOException { + // a cached remote catalog should not mask missing data files + Path remoteDir = tempFolder.newFolder("remote-catalog").toPath(); + Path cachedDataDir = tempFolder.newFolder("cached-public-data").toPath(); + + Files.writeString(remoteDir.resolve("catalog_entries.yaml"), + "test-ds:\n" + + " base: test_base.fvecs\n" + + " query: test_query.fvecs\n" + + " gt: test_gt.ivecs\n"); + writeTestDataFiles(remoteDir); + + HttpServer server = startFileServer(remoteDir); + try { + Files.writeString(cacheDir.resolve("public-catalog.yaml"), + "_include:\n" + + " url: " + urlFor(server, "catalog_entries.yaml") + "\n" + + "_defaults:\n" + + " cache_dir: " + cachedDataDir + "\n"); + + // construct once online so the included remote catalog is cached locally, + // but do not load the dataset, so the data files are not downloaded + new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + } finally { + server.stop(0); + } + + assertFalse(Files.exists(cachedDataDir.resolve("test_base.fvecs")), + "Precondition: dataset files should not have been downloaded"); + + var offlineLoader = new DataSetLoaderSimpleMFD( + null, cacheDir.toString(), false, testMetadata + ); + + assertThrows(RuntimeException.class, () -> offlineLoader.loadDataSet("test-ds"), + "Cached remote catalog should still fail when the chosen data files are missing offline"); + } + + // ======================================================================== + // Helpers + // ======================================================================== + + /// Starts a simple static HTTP file server rooted at the 
given directory.
+    private static HttpServer startFileServer(Path root) throws IOException {
+        // Binding to port 0 asks the OS for a free ephemeral port; urlFor reads the
+        // actual port back from the running server.
+        var loopback = new InetSocketAddress("127.0.0.1", 0);
+        HttpServer fileServer = HttpServer.create(loopback, 0);
+        fileServer.createContext("/", request -> serveStaticFile(request, root));
+        fileServer.start();
+        return fileServer;
+    }
+
+    /// Builds the loopback URL under which the test HTTP server exposes `filename`.
+    private static String urlFor(HttpServer server, String filename) {
+        int port = server.getAddress().getPort();
+        return "http://127.0.0.1:" + port + "/" + filename;
+    }
+
+    /// Answers one request: streams the requested file from `root` with status 200,
+    /// or replies 404 when the path escapes `root` or is not a regular file.
+    private static void serveStaticFile(HttpExchange exchange, Path root) throws IOException {
+        String uriPath = exchange.getRequestURI().getPath();
+        String relative = uriPath;
+        if (uriPath.startsWith("/")) {
+            relative = uriPath.substring(1);
+        }
+        // normalize() collapses any ".." segments so the startsWith check below is meaningful.
+        Path target = root.resolve(relative).normalize();
+
+        boolean servable = target.startsWith(root) && Files.isRegularFile(target);
+        if (servable) {
+            byte[] payload = Files.readAllBytes(target);
+            exchange.sendResponseHeaders(200, payload.length);
+            try (OutputStream body = exchange.getResponseBody()) {
+                body.write(payload);
+            }
+            return;
+        }
+
+        // A response length of -1 means no response body is sent.
+        exchange.sendResponseHeaders(404, -1);
+        exchange.close();
+    }
+
+    /// Writes a small local-override dataset so tests can distinguish it from the remote copy.
+    private static void writeLocalOverrideDataFiles(Path dir) throws IOException {
+        // One base vector and one query (versus five and two in writeTestDataFiles), so a
+        // test can tell from the vector counts which copy was loaded.
+        float[][] onlyVector = {
+                {1.0f, 1.0f, 0.0f, 0.0f},
+        };
+        int[][] onlyNeighbor = {
+                {0},
+        };
+        writeTestFvecs(dir.resolve("test_base.fvecs"), 4, onlyVector);
+        writeTestFvecs(dir.resolve("test_query.fvecs"), 4, onlyVector);
+        writeTestIvecs(dir.resolve("test_gt.ivecs"), onlyNeighbor);
+    }
+
+    /// Writes a `catalog_entries.yaml` into `dir` describing the single dataset `test-ds`.
+    private static void writeTestCatalog(Path dir) throws IOException {
+        String catalogYaml =
+                "test-ds:\n" +
+                " base: test_base.fvecs\n" +
+                " query: test_query.fvecs\n" +
+                " gt: test_gt.ivecs\n";
+        Files.writeString(dir.resolve("catalog_entries.yaml"), catalogYaml);
+    }
+
+    /// Writes the default test dataset into `dir`: five 4-dimensional base vectors,
+    /// two query vectors, and one five-entry ground-truth row per query.
+    private static void writeTestDataFiles(Path dir) throws IOException {
+        float[][] baseVectors = {
+                {1.0f, 0.0f, 0.0f, 0.0f},
+                {0.0f, 1.0f, 0.0f, 0.0f},
+                {0.0f, 0.0f, 1.0f, 0.0f},
+                {0.0f, 0.0f, 0.0f, 1.0f},
+                {0.5f, 0.5f, 0.5f, 0.5f},
+        };
+        float[][] queryVectors = {
+                {1.0f, 0.0f, 0.0f, 0.0f},
+                {0.0f, 0.0f, 1.0f, 0.0f},
+        };
+        int[][] groundTruth = {
+                {0, 4, 1, 2, 3},
+                {2, 4, 0, 1, 3},
+        };
+        writeTestFvecs(dir.resolve("test_base.fvecs"), 4, baseVectors);
+        writeTestFvecs(dir.resolve("test_query.fvecs"), 4, queryVectors);
+        writeTestIvecs(dir.resolve("test_gt.ivecs"), groundTruth);
+    }
+
+    /// Writes vectors in the standard fvecs format.
+    ///
+    /// Every record is a little-endian int holding `dimension`, followed by the vector's
+    /// components as little-endian floats.
+    ///
+    /// @param path      file to create or overwrite
+    /// @param dimension number of components each vector must have
+    /// @param vectors   the vectors to write, in order
+    /// @throws IllegalArgumentException if any vector's length differs from `dimension`
+    /// @throws IOException              if the file cannot be written
+    private static void writeTestFvecs(Path path, int dimension, float[][] vectors) throws IOException {
+        int recordBytes = Integer.BYTES + dimension * Float.BYTES;
+        var out = ByteBuffer.allocate(vectors.length * recordBytes).order(ByteOrder.LITTLE_ENDIAN);
+        for (float[] vector : vectors) {
+            // The header promises `dimension` floats; a shorter or longer vector would
+            // misalign every following record (or overflow the buffer), so fail loudly.
+            if (vector.length != dimension) {
+                throw new IllegalArgumentException(
+                        "Vector length " + vector.length + " does not match dimension " + dimension);
+            }
+            out.putInt(dimension);
+            for (float component : vector) {
+                out.putFloat(component);
+            }
+        }
+        Files.write(path, out.array());
+    }
+
+    /// Writes ground truth in the standard ivecs format.
+ private static void writeTestIvecs(Path path, int[][] entries) throws IOException { + int totalBytes = 0; + for (int[] entry : entries) totalBytes += Integer.BYTES + entry.length * Integer.BYTES; + var buf = ByteBuffer.allocate(totalBytes).order(ByteOrder.LITTLE_ENDIAN); + for (int[] entry : entries) { + buf.putInt(entry.length); + for (int v : entry) buf.putInt(v); + } + Files.write(path, buf.array()); + } +} diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java index 8ed60e5bb..13f2136aa 100644 --- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java @@ -380,7 +380,7 @@ public void dataSetInfoLazyLoading() { @Test public void productionMetadataFileLoadsSuccessfully() { - // This validates the actual dataset_metadata.yml is well-formed + // This validates the actual dataset-metadata.yml is well-formed var reader = DataSetMetadataReader.load(); var props = reader.getProperties("ada002-100k"); assertTrue(props.isPresent(), "ada002-100k should be in the production metadata file"); @@ -391,7 +391,7 @@ public void productionMetadataFileLoadsSuccessfully() { public void productionMetadataAllEntriesHaveSimilarityFunction() { var reader = DataSetMetadataReader.load(); // All entries in the production metadata should have a similarity function - for (var name : new String[]{"cohere-english-v3-100k", "ada002-100k", "openai-v3-small-100k", + for (var name : new String[]{"cohere-english-v3-100k", "ada002-100k", "openai-v3-small-1536-100k", "gecko-100k", "openai-v3-large-3072-100k", "openai-v3-large-1536-100k", "e5-small-v2-100k", "e5-base-v2-100k", "e5-large-v2-100k", "ada002-1M", "colbert-1M"}) { diff --git 
a/jvector-examples/yaml-configs/dataset-catalogs/datasets.md b/jvector-examples/yaml-configs/dataset-catalogs/datasets.md new file mode 100644 index 000000000..ecfd0d9cf --- /dev/null +++ b/jvector-examples/yaml-configs/dataset-catalogs/datasets.md @@ -0,0 +1,148 @@ +# Hosting Datasets + +You can host and distribute your datasets remotely as long as they are available +via HTTPS or S3. This guide explains how to set that up. + +## Directory layout + +All `.yaml` and `.yml` files under this directory tree are discovered automatically at startup. +This is configured in DataSets.java as a loader parameter. + +``` +jvector-examples/ + yaml-configs/ + dataset-catalogs/ + public-catalog.yaml # uses _include to pull in the public S3 catalog + local-catalog.yaml # reference/template with all options documented + my-team-datasets.yaml # your own catalog file (any name) for your own datasets +``` + +## Quick start + +### Using public datasets + +Public datasets work out of the box. `public-catalog.yaml` uses +`_include` to pull the dataset catalog from S3, and files are downloaded on first use: + +```sh +# see what's available +curl -L https://jvector-datasets-public.s3.us-east-1.amazonaws.com/datasets-clean/catalog_entries.yaml +``` + +Downloaded files are cached locally in `dataset_cache/public/` by default. +Set the `DATASET_CACHE_DIR` environment variable to change this location. + +### Adding your own local datasets + +1. Create a `.yaml` file anywhere under this directory (e.g. `custom-catalog.yaml`). +2. Map each dataset name to its three files: + +```yaml +my-dataset: + base: /path/to/base_vectors.fvecs + query: /path/to/query_vectors.fvecs + gt: /path/to/ground_truth.ivecs +``` + +3. Add matching entries for the new dataset to these files as well, so BenchYAML can use it: + - `jvector-examples/yaml-configs/dataset-metadata.yml` + - `jvector-examples/yaml-configs/datasets.yml` + +### Hosting remote datasets + +You can host datasets on any S3 bucket or HTTPS server.
Each dataset needs three files +in fvecs/ivecs format (base vectors, query vectors, ground truth indices). + +**Option A: Use `_include` to reference a remote catalog** + +Create a thin local YAML that pulls entries from a remote `catalog_entries.yaml`: + +```yaml +_defaults: + cache_dir: ${DATASET_CACHE_DIR:-dataset_cache}/my-remote + +_include: + url: s3://my-bucket/datasets/catalog_entries.yaml +``` + +The remote catalog lists dataset entries in the same format. Its base path (the directory +containing the catalog file) is used as the default `base_url` for all included entries. + +**Option B: Use `base_url` per entry or in `_defaults`** + +```yaml +_defaults: + base_url: s3://my-bucket/datasets/ + cache_dir: ${DATASET_CACHE_DIR:-dataset_cache}/my-remote + +ada002-100k: + base: ada_002_100k_base.fvecs + query: ada_002_100k_query.fvecs + gt: ada_002_100k_gt.ivecs +``` + +File paths are appended to `base_url` for downloading. Files in subdirectories work too +(e.g. `base: subdir/file.fvecs` downloads from `s3://my-bucket/datasets/subdir/file.fvecs`). + +### Private datasets with secret paths + +Use `${VAR}` env var expansion to keep secrets out of committed files: + +```yaml +_defaults: + base_url: s3://my-bucket/${DATASET_SECRET_HASH}/ + cache_dir: ${DATASET_CACHE_DIR:-dataset_cache}/private + +dpr-1M: + base: dpr/base.fvecs + query: dpr/query.fvecs + gt: dpr/gt.ivecs +``` + +Set `DATASET_SECRET_HASH` in your environment. The `${VAR:-default}` syntax provides a +fallback value when the variable is not set. 
+ +## Catalog file reference + +### Required fields (per dataset entry) + +| Field | Description | +|---------|-------------| +| `base` | Path to base vectors file (`.fvecs`) | +| `query` | Path to query vectors file (`.fvecs`) | +| `gt` | Path to ground truth indices file (`.ivecs`) | + +### Optional fields + +| Field | Description | +|-------------|-------------| +| `base_url` | Remote URL (S3 or HTTPS) to download files from when not cached locally | +| `cache_dir` | Local directory for cached files (relative or absolute path) | + +### Special entries + +| Key | Description | +|--------------|-------------| +| `_defaults` | Default values folded into all dataset entries in the same file. Entry-level values take precedence. | +| `_include` | Contains a `url` field pointing to a remote catalog. Remote entries are fetched and merged with local `_defaults`. | +| `_*` | Any root key starting with `_` is excluded from dataset names. | + +### Environment variables + +- Field values support `${VAR}` and `${VAR:-default}` syntax (bash-style). +- `${VAR}` expands to the environment variable value; throws an error if not set. +- `${VAR:-default}` uses the default when the variable is not set (including `${VAR:-}` for empty string). +- The `DATASET_CACHE_DIR` environment variable sets a global default `cache_dir` when none is specified at the entry or `_defaults` level. + +### Cache directory resolution order + +1. `cache_dir` on the dataset entry +2. `cache_dir` in `_defaults` +3. `DATASET_CACHE_DIR` environment variable +4. 
`dataset_cache/` under the repository root + +### Supported transport protocols + +- **S3** (`s3://bucket/path`) -- uses the AWS SDK with anonymous credentials +- **HTTPS** (`https://host/path`) -- uses Java's built-in HTTP client +- **Local files** -- no download; files are read directly from the resolved path diff --git a/jvector-examples/yaml-configs/dataset-catalogs/local-catalog.yaml b/jvector-examples/yaml-configs/dataset-catalogs/local-catalog.yaml new file mode 100644 index 000000000..0aee43b9a --- /dev/null +++ b/jvector-examples/yaml-configs/dataset-catalogs/local-catalog.yaml @@ -0,0 +1,41 @@ +# This file maps dataset names to their data files (base, query, ground truth). +# Any .yaml or .yml file under this directory tree is discovered automatically. +# +# Required fields per dataset: +# base - path to base vectors file (.fvecs) +# query - path to query vectors file (.fvecs) +# gt - path to ground truth indices file (.ivecs) +# +# Optional fields: +# base_url - remote URL (S3 or HTTP) to fetch files from when not cached locally +# cache_dir - local directory for cached files (relative or absolute path) +# +# Special entries: +# _defaults - provides default values folded into all other entries in this file. +# Entry-level values take precedence over defaults. +# Any root key starting with _ is excluded from dataset names. +# +# Environment variables: +# Field values support ${VAR} and ${VAR:-default} syntax (bash-style). +# ${VAR} expands to the environment variable value (error if not set). +# ${VAR:-default} uses the default value when the variable is not set. +# The DATASET_CACHE_DIR environment variable sets a global default cache_dir +# when none is specified at the entry or _defaults level. 
+# +# Example: +# +# _defaults: +# base_url: s3://my-bucket/${DATASET_HASH:-datasets}/ +# cache_dir: ${DATASET_CACHE_DIR:-/tmp/dataset-cache} +# +# my_local_data: +# base: path_to_base_vectors.fvecs +# query: path_to_query_vectors.fvecs +# gt: path_to_ground_truth_indices.ivecs +# +# my_remote_data: +# base_url: s3://my-bucket/${SECRET_HASH}/ +# cache_dir: /fast-ssd/private +# base: private_base.fvecs +# query: private_query.fvecs +# gt: private_gt.ivecs diff --git a/jvector-examples/yaml-configs/dataset-catalogs/public-catalog.yaml b/jvector-examples/yaml-configs/dataset-catalogs/public-catalog.yaml new file mode 100644 index 000000000..cf0188de3 --- /dev/null +++ b/jvector-examples/yaml-configs/dataset-catalogs/public-catalog.yaml @@ -0,0 +1,14 @@ +# Please do not modify this file. +# +# Hint: you can see what's available in our public-catalog by running +# curl -L https://jvector-datasets-public.s3.us-east-1.amazonaws.com/datasets-clean/catalog_entries.yaml + +_include: + url: s3://jvector-datasets-public/datasets-clean/catalog_entries.yaml + +# This sets the local cache directory for the datasets. +# If it is not set, the dataset will be cached under dataset_cache/ +_defaults: + cache_dir: ${DATASET_CACHE_DIR:-dataset_cache}/public +# cache_dir: ${HOME}/... +# cache_dir: /absolute/path/... diff --git a/jvector-examples/yaml-configs/dataset-catalogs/sharing.md b/jvector-examples/yaml-configs/dataset-catalogs/sharing.md new file mode 100644 index 000000000..288e319f5 --- /dev/null +++ b/jvector-examples/yaml-configs/dataset-catalogs/sharing.md @@ -0,0 +1,33 @@ +# Sharing a Dataset + +To share a remotely hosted dataset: + +1. **Prepare your files** -- you need three files in fvecs/ivecs format: + - `base_vectors.fvecs` -- the vectors to index + - `query_vectors.fvecs` -- the vectors to search with + - `ground_truth.ivecs` -- the known nearest neighbor indices for each query + +2. **Upload them** to an S3 bucket or HTTPS-accessible location. + +3. 
**Create a catalog file** (any `.yaml` file) listing the dataset: + ```yaml + _defaults: + base_url: https://my-server.com/datasets/ + + my-dataset: + base: my_base_vectors.fvecs + query: my_query_vectors.fvecs + gt: my_ground_truth.ivecs + ``` + +4. **Distribute the catalog file.** Recipients drop it into + `jvector-examples/yaml-configs/dataset-catalogs/` and the loader picks it up automatically. + Remote files are downloaded on first use. Downloaded files are cached locally. + +For private datasets, use `${VAR}` in the `base_url` to keep secret paths out of the file: +```yaml +_defaults: + base_url: s3://my-bucket/${SECRET_HASH}/ +``` + +See [datasets.md](datasets.md) for the full configuration reference. diff --git a/jvector-examples/yaml-configs/dataset-metadata.yml b/jvector-examples/yaml-configs/dataset-metadata.yml new file mode 100644 index 000000000..fe48746d1 --- /dev/null +++ b/jvector-examples/yaml-configs/dataset-metadata.yml @@ -0,0 +1,99 @@ +# This file contains authoritative metadata for curated benchmark datasets whose +# raw formats do not carry the properties we need at runtime. +# +# The loaders use this file to determine dataset properties, such +# as similarity_function and load_behavior. +# +# load_behavior controls benchmark-loader processing: +# LEGACY_SCRUB - Use (the soon-to-be deprecated) load-time scrubbing behavior +# NO_SCRUB - load vectors and ground truth exactly as stored +# +# In JVector 4.0.0-rc.8 and earlier, datasets were scrubbed at load-time using methods +# that were found to have problems. These are enabled using LEGACY_SCRUB, which should +# only be used to reproduce historical run results. +# +# Additional metadata requires corresponding support in DataSetProperties and the +# relevant loader code. 
+ +ada002-100k: + similarity_function: COSINE + load_behavior: NO_SCRUB +ada002-1M: + similarity_function: COSINE + load_behavior: NO_SCRUB +cap-1M: + similarity_function: COSINE + load_behavior: NO_SCRUB +cap-6M: + similarity_function: COSINE + load_behavior: NO_SCRUB +cohere-english-v3-100k: + similarity_function: COSINE + load_behavior: NO_SCRUB +cohere-english-v3-1M: + similarity_function: COSINE + load_behavior: NO_SCRUB +cohere-english-v3-10M: + similarity_function: COSINE + load_behavior: NO_SCRUB +colbert-1M: + similarity_function: COSINE + load_behavior: NO_SCRUB +colbert-10M: + similarity_function: COSINE + load_behavior: NO_SCRUB +degen-200k: + similarity_function: COSINE + load_behavior: NO_SCRUB +dpr-1M: + similarity_function: COSINE + load_behavior: NO_SCRUB +dpr-10M: + similarity_function: COSINE + load_behavior: NO_SCRUB +e5-small-v2-100k: + similarity_function: COSINE + load_behavior: NO_SCRUB +e5-base-v2-100k: + similarity_function: COSINE + load_behavior: NO_SCRUB +e5-large-v2-100k: + similarity_function: COSINE + load_behavior: NO_SCRUB +gecko-100k: + similarity_function: COSINE + load_behavior: NO_SCRUB +nv-qa-v4-100k: + similarity_function: COSINE + load_behavior: NO_SCRUB +openai-v3-small-1536-100k: + similarity_function: COSINE + load_behavior: NO_SCRUB +openai-v3-large-3072-100k: + similarity_function: COSINE + load_behavior: NO_SCRUB +openai-v3-large-1536-100k: + similarity_function: COSINE + load_behavior: NO_SCRUB +# ann-benchmarks +glove-25-angular: + similarity_function: COSINE + load_behavior: NO_SCRUB +glove-50-angular: + similarity_function: COSINE + load_behavior: NO_SCRUB +lastfm-64-dot: + similarity_function: DOT_PRODUCT + load_behavior: NO_SCRUB +glove-100-angular: + similarity_function: COSINE + load_behavior: NO_SCRUB +glove-200-angular: + similarity_function: COSINE + load_behavior: NO_SCRUB +nytimes-256-angular: + similarity_function: COSINE + load_behavior: NO_SCRUB +sift-128-euclidean: + similarity_function: EUCLIDEAN + 
load_behavior: NO_SCRUB \ No newline at end of file diff --git a/jvector-examples/yaml-configs/dataset_metadata.yml b/jvector-examples/yaml-configs/dataset_metadata.yml deleted file mode 100644 index 21e5e69f9..000000000 --- a/jvector-examples/yaml-configs/dataset_metadata.yml +++ /dev/null @@ -1,99 +0,0 @@ -# This file contains authoritative metadata for curated benchmark datasets whose -# raw formats do not carry the properties we need at runtime. -# -# Both the MFD and HDF5 loaders use this file to determine dataset properties such -# as similarity_function and load_behavior. -# -# load_behavior controls benchmark-loader processing: -# LEGACY_SCRUB - preserve the current load-time scrubbing behavior -# NO_SCRUB - load vectors and ground truth exactly as stored -# -# During the transition, existing deployed datasets should generally remain on -# LEGACY_SCRUB until their prescrubbed replacements and matching offline ground -# truth are ready. New prescrubbed datasets should use NO_SCRUB. -# -# Additional metadata requires corresponding support in DataSetProperties and the -# relevant loader code. 
- -ada002-100k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -ada002-1M: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -cap-1M: - similarity_function: DOT_PRODUCT - load_behavior: LEGACY_SCRUB -cap-6M: - similarity_function: DOT_PRODUCT - load_behavior: LEGACY_SCRUB -cohere-english-v3-100k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -cohere-english-v3-1M: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -cohere-english-v3-10M: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -colbert-1M: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -colbert-10M: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -degen-200k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -dpr-1M: - similarity_function: DOT_PRODUCT - load_behavior: LEGACY_SCRUB -dpr-10M: - similarity_function: DOT_PRODUCT - load_behavior: LEGACY_SCRUB -e5-small-v2-100k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -e5-base-v2-100k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -e5-large-v2-100k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -gecko-100k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -nv-qa-v4-100k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -openai-v3-small-100k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -openai-v3-large-3072-100k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -openai-v3-large-1536-100k: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -# ann-benchmarks -glove-25-angular.hdf5: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -glove-50-angular.hdf5: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -lastfm-64-dot.hdf5: - similarity_function: DOT_PRODUCT - load_behavior: LEGACY_SCRUB -glove-100-angular.hdf5: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -glove-200-angular.hdf5: - similarity_function: COSINE - load_behavior: 
LEGACY_SCRUB -nytimes-256-angular.hdf5: - similarity_function: COSINE - load_behavior: LEGACY_SCRUB -sift-128-euclidean.hdf5: - similarity_function: EUCLIDEAN - load_behavior: LEGACY_SCRUB \ No newline at end of file diff --git a/jvector-examples/yaml-configs/datasets.yml b/jvector-examples/yaml-configs/datasets.yml index a35555704..58d23116e 100644 --- a/jvector-examples/yaml-configs/datasets.yml +++ b/jvector-examples/yaml-configs/datasets.yml @@ -1,23 +1,30 @@ -neighborhood-watch-100k: +jvector-100k: - cohere-english-v3-100k - ada002-100k - - openai-v3-small-100k + - openai-v3-small-1536-100k - gecko-100k - openai-v3-large-3072-100k - openai-v3-large-1536-100k - e5-small-v2-100k - e5-base-v2-100k - e5-large-v2-100k -neighborhood-watch-1M: +jvector-1M: - ada002-1M - colbert-1M ann-benchmarks: - - glove-25-angular.hdf5 - - glove-50-angular.hdf5 - - lastfm-64-dot.hdf5 - - glove-100-angular.hdf5 - - glove-200-angular.hdf5 - - nytimes-256-angular.hdf5 - - sift-128-euclidean.hdf5 - # - deep-image-96-angular.hdf5 # large files not yet supported - # - gist-960-euclidean.hdf5 # large files not yet supported \ No newline at end of file + - glove-25-angular + - glove-50-angular + - lastfm-64-dot + - glove-100-angular + - glove-200-angular + - nytimes-256-angular + - sift-128-euclidean +#other-datasets: +# - dpr-1M +# - dpr-10M +# - cap-1M +# - cap-6M +# - cohere-english-v3-1M +# - cohere-english-v3-10M +# - deep-image-96-angular # large files not yet supported +# - gist-960-euclidean # large files not yet supported \ No newline at end of file diff --git a/jvector-examples/yaml-configs/autoDefault.yml b/jvector-examples/yaml-configs/index-parameters/autoDefault.yml similarity index 100% rename from jvector-examples/yaml-configs/autoDefault.yml rename to jvector-examples/yaml-configs/index-parameters/autoDefault.yml diff --git a/jvector-examples/yaml-configs/colbert-1M.yml b/jvector-examples/yaml-configs/index-parameters/colbert-1M.yml similarity index 95% rename from 
jvector-examples/yaml-configs/colbert-1M.yml rename to jvector-examples/yaml-configs/index-parameters/colbert-1M.yml index b9e6c72b7..d0de5e0f4 100644 --- a/jvector-examples/yaml-configs/colbert-1M.yml +++ b/jvector-examples/yaml-configs/index-parameters/colbert-1M.yml @@ -24,4 +24,4 @@ search: compression: - type: None -# Run-level controls, such as benchmarks, console, and logging, are in run.yml. \ No newline at end of file +# Run-level controls, such as benchmarks, console, and logging, are in run-config.yml. \ No newline at end of file diff --git a/jvector-examples/yaml-configs/default.yml b/jvector-examples/yaml-configs/index-parameters/default.yml similarity index 98% rename from jvector-examples/yaml-configs/default.yml rename to jvector-examples/yaml-configs/index-parameters/default.yml index 346a701e4..b56e27ed0 100644 --- a/jvector-examples/yaml-configs/default.yml +++ b/jvector-examples/yaml-configs/index-parameters/default.yml @@ -36,4 +36,4 @@ search: centerData: No anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy) -# Run-level controls, such as benchmarks, console, and logging, are in run.yml. \ No newline at end of file +# Run-level controls, such as benchmarks, console, and logging, are in run-config.yml. 
\ No newline at end of file diff --git a/jvector-examples/yaml-configs/glove-100-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-100-angular.yml similarity index 100% rename from jvector-examples/yaml-configs/glove-100-angular.yml rename to jvector-examples/yaml-configs/index-parameters/glove-100-angular.yml diff --git a/jvector-examples/yaml-configs/glove-200-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-200-angular.yml similarity index 100% rename from jvector-examples/yaml-configs/glove-200-angular.yml rename to jvector-examples/yaml-configs/index-parameters/glove-200-angular.yml diff --git a/jvector-examples/yaml-configs/glove-25-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-25-angular.yml similarity index 100% rename from jvector-examples/yaml-configs/glove-25-angular.yml rename to jvector-examples/yaml-configs/index-parameters/glove-25-angular.yml diff --git a/jvector-examples/yaml-configs/glove-50-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-50-angular.yml similarity index 100% rename from jvector-examples/yaml-configs/glove-50-angular.yml rename to jvector-examples/yaml-configs/index-parameters/glove-50-angular.yml diff --git a/jvector-examples/yaml-configs/lastfm-64-dot.yml b/jvector-examples/yaml-configs/index-parameters/lastfm-64-dot.yml similarity index 100% rename from jvector-examples/yaml-configs/lastfm-64-dot.yml rename to jvector-examples/yaml-configs/index-parameters/lastfm-64-dot.yml diff --git a/jvector-examples/yaml-configs/nytimes-256-angular.yml b/jvector-examples/yaml-configs/index-parameters/nytimes-256-angular.yml similarity index 100% rename from jvector-examples/yaml-configs/nytimes-256-angular.yml rename to jvector-examples/yaml-configs/index-parameters/nytimes-256-angular.yml diff --git a/jvector-examples/yaml-configs/sift-128-euclidean.yml b/jvector-examples/yaml-configs/index-parameters/sift-128-euclidean.yml similarity index 100% rename from 
jvector-examples/yaml-configs/sift-128-euclidean.yml rename to jvector-examples/yaml-configs/index-parameters/sift-128-euclidean.yml diff --git a/jvector-examples/yaml-configs/run.yml b/jvector-examples/yaml-configs/run-config.yml similarity index 100% rename from jvector-examples/yaml-configs/run.yml rename to jvector-examples/yaml-configs/run-config.yml diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java index 8e9cc712f..28127fb34 100644 --- a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java +++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java @@ -17,7 +17,7 @@ import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet; -import io.github.jbellis.jvector.example.benchmarks.datasets.DataSetLoaderHDF5; +import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets; import io.github.jbellis.jvector.graph.GraphIndexBuilder; import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; import org.openjdk.jmh.annotations.Benchmark; @@ -44,8 +44,8 @@ public static class Parameters { final ListRandomAccessVectorValues ravv; public Parameters() { - this.ds = new DataSetLoaderHDF5().loadDataSet("hdf5/glove-100-angular.hdf5").orElseThrow( - () -> new RuntimeException("Unable to load dataset: hdf5/glove-100-angular.hdf5" ) + this.ds = DataSets.loadDataSet("glove-100-angular").orElseThrow( + () -> new RuntimeException("Unable to load dataset: glove-100-angular") ).getDataSet(); this.ravv = new ListRandomAccessVectorValues(ds.getBaseVectors(), ds.getBaseVectors().get(0).length()); } diff --git a/rat-excludes.txt b/rat-excludes.txt index 64aeba7a3..436c97822 100644 --- a/rat-excludes.txt +++ b/rat-excludes.txt @@ -24,6 +24,11 @@ src/test/resources/log4j2-test.xml results.csv scripts/test_node_setup.sh scripts/jmh_results_formatter.py 
-yaml-configs/*.yml +yaml-configs/**/*.yaml +yaml-configs/**/*.yml src/main/resources/logback.xml docs/**/*.md +yaml-configs/**/*.md +local_datasets/** +**/datasets/** +