Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 2 additions & 30 deletions packages/viewer/src/search/search.worker.ts
Original file line number Diff line number Diff line change
@@ -1,41 +1,13 @@
// Copyright (c) 2025 Apple Inc. Licensed under MIT License.

import { createWorkerRuntime } from "@embedding-atlas/utils";
import { Charset, Index, type IndexOptions } from "flexsearch";

import { SearchIndex } from "./search_index.js";

// Create the worker runtime. It hands back the message handler for this
// worker and the hook used further down to register classes by name.
let { handler, registerClass } = createWorkerRuntime();

// Route every message posted to this worker through the runtime's handler.
onmessage = handler;

// Options passed to every flexsearch index constructed in this file.
const options: IndexOptions = {
// Tokenizer mode handed to flexsearch.
// NOTE(review): "forward" presumably enables prefix matching — confirm against the flexsearch docs.
tokenize: "forward",
// Encoder preset taken from flexsearch's Charset collection.
encoder: Charset.LatinBalance,
};

/**
 * Thin wrapper around a flexsearch `Index` offering bulk insertion, a
 * size-limited lookup, and a full reset.
 */
class SearchIndex {
  /** The flexsearch index currently in use; swapped for a new one by `clear`. */
  private index: Index;

  constructor() {
    this.index = new Index(options);
  }

  /** Empty the current index, then continue with a freshly constructed one. */
  clear() {
    const stale = this.index;
    stale.clear();
    stale.cleanup();
    this.index = new Index(options);
  }

  /** Add the text of every point to the index under that point's id. */
  addPoints(points: { id: string | number; text: string }[]) {
    for (const { id, text } of points) {
      this.index.add(id, text);
    }
  }

  /** Search the index for `query` and return at most `limit` matching ids. */
  query(query: string, limit: number): (string | number)[] {
    const matches = this.index.search(query, { limit });
    return matches;
  }
}

// Type-only re-export: importers get the shape of SearchIndex without a
// runtime binding to this worker module.
export type { SearchIndex };

// Register a factory under the name "SearchIndex" with the worker runtime;
// each invocation of the factory builds a new, empty SearchIndex.
registerClass("SearchIndex", () => new SearchIndex());
131 changes: 131 additions & 0 deletions packages/viewer/src/search/search_index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
// Copyright (c) 2025 Apple Inc. Licensed under MIT License.

import { Charset, Index, type IndexOptions } from "flexsearch";

// Options passed to every flexsearch index constructed in this module.
const options: IndexOptions = {
// Tokenizer mode handed to flexsearch.
// NOTE(review): "forward" presumably enables prefix matching — confirm against the flexsearch docs.
tokenize: "forward",
// Encoder preset from flexsearch's Charset collection. It maps similar-looking
// words to the same token (for example "aldi" and "aldea"), which is what makes
// the default search fuzzy.
encoder: Charset.LatinBalance,
};

/**
* A query parsed into exact phrases and the remaining free text.
*
* Double-quoted runs become exact phrases, everything outside the quotes is
* collected as free text. For example `"aldi" store` parses to one phrase
* (`aldi`) plus the free text `store`.
*/
export interface ParsedQuery {
/** Contents of each non-empty double-quoted run, in query order, without the surrounding quotes. */
phrases: string[];
/** The text outside the quoted runs, with leading and trailing whitespace removed; empty when the query has none. */
freeText: string;
}

/**
 * Split a search query into exact phrases and fuzzy free text.
 *
 * Wrapping part of the query in double quotes (e.g. `"aldi"`) asks for an
 * exact, case-insensitive substring match instead of the default fuzzy token
 * search. The default encoder folds similar-looking words into the same token
 * (for example "aldi" and "aldea"); that helps recall, but it surfaces
 * unwanted rows when the user knows exactly what they want. Only the quoted
 * run opts out: in `"aldi" store` the substring "aldi" is required exactly
 * while "store" is still matched fuzzily.
 *
 * Quoted runs are paired left to right. Empty quotes (`""`) yield no phrase.
 * A final quote with no partner is not a run at all; it stays in the free
 * text as a literal character, so a half-typed query still searches.
 *
 * @param query The raw query string as typed by the user.
 * @returns The phrases in query order, plus the text outside the quotes with
 *   each quoted run replaced by a single space and the result trimmed.
 */
export function parseQuery(query: string): ParsedQuery {
  // An opening quote, anything that is not a quote, then the closing quote.
  // A trailing unpaired quote can never match, so it is left where it is.
  const quotedRun = /"([^"]*)"/g;
  const phrases: string[] = [];

  const outsideQuotes = query.replace(quotedRun, (_run: string, inner: string) => {
    if (inner !== "") {
      phrases.push(inner);
    }
    // A single space stands in for the run so the words on either side of it
    // do not fuse together.
    return " ";
  });

  return { phrases, freeText: outsideQuotes.trim() };
}

/**
 * Full text search index backed by flexsearch.
 *
 * In addition to the default fuzzy token search, the index keeps a lowercased
 * copy of each point's text so a quoted query can perform an exact,
 * case-insensitive substring match without lowercasing every text again on
 * every query.
 */
export class SearchIndex {
  private index: Index;
  /** Lowercased text of each indexed point, keyed by point id. */
  private texts: Map<string | number, string>;

  constructor() {
    this.index = new Index(options);
    this.texts = new Map();
  }

  /** Drop every indexed point from both the fuzzy index and the exact-match texts. */
  clear() {
    this.index.clear();
    this.index.cleanup();
    this.index = new Index(options);
    this.texts = new Map();
  }

  /**
   * Add points to the fuzzy index and remember their text for exact matching.
   *
   * @param points The points to index; `id` is what `query` returns.
   */
  addPoints(points: { id: string | number; text: string }[]) {
    for (const p of points) {
      this.index.add(p.id, p.text);
      if (p.text == null) {
        // Rows without text can never satisfy a phrase. Callers outside the
        // type checker may still send them, so skip instead of throwing.
        this.texts.delete(p.id);
      } else {
        // Lowercase once here rather than once per candidate per query.
        this.texts.set(p.id, p.text.toLowerCase());
      }
    }
  }

  /**
   * Search the index.
   *
   * Unquoted queries go straight to the fuzzy index. Double-quoted runs are
   * required as exact, case-insensitive substrings; any remaining unquoted
   * text narrows and orders the candidates through the fuzzy index.
   *
   * @param query The raw query, optionally containing double-quoted phrases.
   * @param limit The maximum number of ids to return.
   * @returns The ids of the matching points, at most `limit` of them.
   */
  query(query: string, limit: number): (string | number)[] {
    const { phrases, freeText } = parseQuery(query);

    // No exact phrases: keep the original fuzzy path untouched.
    if (phrases.length === 0) {
      return this.index.search(freeText, { limit });
    }

    // The loop below checks the limit only after pushing a hit, so a
    // non-positive limit has to be rejected up front. An empty index cannot
    // match anything either.
    if (limit <= 0 || this.texts.size === 0) {
      return [];
    }

    // Candidate ids that contain every required phrase as a substring. When
    // there is also free text, narrow to the fuzzy hits for that text so the
    // phrases act as a filter on top of the normal ranking.
    let candidates: Iterable<string | number>;
    if (freeText.length > 0) {
      // Ask for every fuzzy hit. Without an explicit limit flexsearch returns
      // only its default number of results, and rows that contain the phrase
      // but rank below that cutoff would be lost before the filter runs.
      candidates = this.index.search(freeText, { limit: this.texts.size }) as (string | number)[];
    } else {
      candidates = this.texts.keys();
    }

    const needles = phrases.map((p) => p.toLowerCase());
    const result: (string | number)[] = [];
    for (const id of candidates) {
      const haystack = this.texts.get(id);
      if (haystack == null) {
        continue;
      }
      if (needles.every((needle) => haystack.includes(needle))) {
        result.push(id);
        if (result.length >= limit) {
          break;
        }
      }
    }
    return result;
  }
}
99 changes: 99 additions & 0 deletions packages/viewer/test/search_index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright (c) 2025 Apple Inc. Licensed under MIT License.

import { describe, expect, test } from "vitest";

import { parseQuery, SearchIndex } from "../src/search/search_index.js";

describe("parseQuery", () => {
  type Parsed = ReturnType<typeof parseQuery>;

  // Asserts, in the order given, that each input parses to its paired expectation.
  const expectParses = (cases: [string, Parsed][]) => {
    for (const [input, expected] of cases) {
      expect(parseQuery(input)).toEqual(expected);
    }
  };

  test("an unquoted query is all free text with no phrases", () => {
    expectParses([
      ["aldi", { phrases: [], freeText: "aldi" }],
      ["aldi store", { phrases: [], freeText: "aldi store" }],
    ]);
  });

  test("a fully quoted query is a single phrase with no free text", () => {
    expectParses([
      ['"aldi"', { phrases: ["aldi"], freeText: "" }],
      ['"new york"', { phrases: ["new york"], freeText: "" }],
    ]);
  });

  test("a mixed query splits phrases from free text", () => {
    expectParses([
      ['"aldi" store', { phrases: ["aldi"], freeText: "store" }],
      ['store "aldi"', { phrases: ["aldi"], freeText: "store" }],
      ['"a" "b" c', { phrases: ["a", "b"], freeText: "c" }],
    ]);
  });

  test("empty quotes contribute no phrase", () => {
    expectParses([
      ['""', { phrases: [], freeText: "" }],
      ['"" store', { phrases: [], freeText: "store" }],
    ]);
  });

  test("an unterminated quote stays in the free text", () => {
    expectParses([
      ['"aldi', { phrases: [], freeText: '"aldi' }],
      ['store "aldi', { phrases: [], freeText: 'store "aldi' }],
    ]);
  });
});

describe("SearchIndex exact-phrase search", () => {
  // Fixture taken from the report in issue #137: the fuzzy encoder folds
  // "aldi" and "aldea" into the same tokens, so a plain search for "aldi"
  // brings up "ALDEA HOMES" ahead of the genuine "ALDI" rows.
  const points = [
    { id: 1, text: "ALDEA HOMES" },
    { id: 2, text: "ALDEA HOMES TWO" },
    { id: 3, text: "ALDI Supermarket" },
    { id: 4, text: "Corner ALDI" },
    { id: 5, text: "Walmart" },
    { id: 6, text: "ALDI store downtown" },
  ];

  // Every test works on its own freshly built index over the same rows.
  const indexedFixture = (): SearchIndex => {
    const fresh = new SearchIndex();
    fresh.addPoints(points);
    return fresh;
  };

  test("the fuzzy search reproduces the false matches from the issue", () => {
    const hits = indexedFixture().query("aldi", 100);
    // The "ALDEA" rows come back next to the real matches; this is exactly
    // what the quoted search exists to avoid.
    for (const aldeaId of [1, 2]) {
      expect(hits).toContain(aldeaId);
    }
  });

  test("a quoted query matches only the exact substring, case-insensitively", () => {
    const hits = indexedFixture().query('"aldi"', 100);
    expect(new Set(hits)).toEqual(new Set([3, 4, 6]));
    for (const excludedId of [1, 2, 5]) {
      expect(hits).not.toContain(excludedId);
    }
  });

  test("a mixed query requires the phrase and fuzzy-matches the free text", () => {
    // The phrase "aldi" has to appear verbatim while "store" narrows through
    // the fuzzy index, leaving only the row that has both.
    const hits = indexedFixture().query('"aldi" store', 100);
    expect(new Set(hits)).toEqual(new Set([6]));
    for (const excludedId of [3, 4]) {
      expect(hits).not.toContain(excludedId);
    }
  });

  test("multiple phrases must all be present", () => {
    const index = indexedFixture();
    const bothPresent = index.query('"aldi" "store"', 100);
    expect(new Set(bothPresent)).toEqual(new Set([6]));
    const neverTogether = index.query('"aldi" "walmart"', 100);
    expect(neverTogether).toEqual([]);
  });

  test("a quoted query respects the limit", () => {
    const hits = indexedFixture().query('"aldi"', 1);
    expect(hits.length).toBe(1);
  });

  test("clear resets both the fuzzy index and the exact-match texts", () => {
    const index = indexedFixture();
    index.clear();
    expect(index.query('"aldi"', 100)).toEqual([]);
  });
});