Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 2 additions & 30 deletions packages/viewer/src/search/search.worker.ts
Original file line number Diff line number Diff line change
@@ -1,41 +1,13 @@
// Copyright (c) 2025 Apple Inc. Licensed under MIT License.

import { createWorkerRuntime } from "@embedding-atlas/utils";
import { Charset, Index, type IndexOptions } from "flexsearch";

import { SearchIndex } from "./search_index.js";

let { handler, registerClass } = createWorkerRuntime();

onmessage = handler;

const options: IndexOptions = {
tokenize: "forward",
encoder: Charset.LatinBalance,
};

class SearchIndex {
private index: Index;

constructor() {
this.index = new Index(options);
}

clear() {
this.index.clear();
this.index.cleanup();
this.index = new Index(options);
}

addPoints(points: { id: string | number; text: string }[]) {
for (let p of points) {
this.index.add(p.id, p.text);
}
}

query(query: string, limit: number): (string | number)[] {
return this.index.search(query, { limit });
}
}

export type { SearchIndex };

registerClass("SearchIndex", () => new SearchIndex());
83 changes: 83 additions & 0 deletions packages/viewer/src/search/search_index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// Copyright (c) 2025 Apple Inc. Licensed under MIT License.

import { Charset, Index, type IndexOptions } from "flexsearch";

const options: IndexOptions = {
tokenize: "forward",
encoder: Charset.LatinBalance,
};

/**
* Parse a query string for an exact-phrase request.
*
* A query wrapped in double quotes (e.g. `"aldi"`) means the user wants an
* exact, case-insensitive substring match instead of the default fuzzy token
* search. The default encoder maps similar-looking words to the same token
* (for example "aldi" and "aldea"), which is great for fuzzy recall but
* surfaces unwanted matches when the user knows exactly what they are looking
* for. Quoting opts out of that behavior.
*
* Returns the inner phrase when the query is a quoted exact phrase, otherwise
* null. An empty quoted string (`""`) is treated as not a phrase so the caller
* falls back to the regular path.
*/
export function parseExactPhrase(query: string): string | null {
if (query.length >= 2 && query.startsWith('"') && query.endsWith('"')) {
let inner = query.slice(1, -1);
return inner.length > 0 ? inner : null;

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be simpler to check for longer than 2 above instead of longer or equal?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, and your other comment about "aldi" store pushed me to rethink this whole parse step. I replaced parseExactPhrase with a parseQuery that walks the string quote by quote, so the special-case length guard is gone entirely. A quoted run becomes a phrase, anything outside the quotes stays free text, and empty quotes just contribute nothing. That removed the >= 2 vs > 2 ambiguity you flagged here.

}
return null;
}

/**
* Full text search index backed by flexsearch.
*
* In addition to the default fuzzy token search, the index keeps the original
* text for each point so a quoted query can perform an exact, case-insensitive
* substring match.
*/
export class SearchIndex {
private index: Index;
private texts: Map<string | number, string>;

constructor() {
this.index = new Index(options);
this.texts = new Map();
}

clear() {
this.index.clear();
this.index.cleanup();
this.index = new Index(options);
this.texts = new Map();
}

addPoints(points: { id: string | number; text: string }[]) {
for (let p of points) {
this.index.add(p.id, p.text);
this.texts.set(p.id, p.text);
}
}

query(query: string, limit: number): (string | number)[] {
let phrase = parseExactPhrase(query);
if (phrase != null) {
return this.exactSearch(phrase, limit);
}
return this.index.search(query, { limit });
}

private exactSearch(phrase: string, limit: number): (string | number)[] {
let needle = phrase.toLowerCase();
let result: (string | number)[] = [];
for (let [id, text] of this.texts) {
if (text.toLowerCase().includes(needle)) {
result.push(id);
if (result.length >= limit) {
break;
}
}
}
return result;
}
}
70 changes: 70 additions & 0 deletions packages/viewer/test/search_index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Copyright (c) 2025 Apple Inc. Licensed under MIT License.

import { describe, expect, test } from "vitest";

import { parseExactPhrase, SearchIndex } from "../src/search/search_index.js";

describe("parseExactPhrase", () => {
test("returns the inner phrase for a quoted query", () => {
expect(parseExactPhrase('"aldi"')).toBe("aldi");
expect(parseExactPhrase('"new york"')).toBe("new york");
});

test("returns null for an unquoted query", () => {
expect(parseExactPhrase("aldi")).toBe(null);
expect(parseExactPhrase('aldi"')).toBe(null);
expect(parseExactPhrase('"aldi')).toBe(null);
});

test("returns null for an empty quoted query", () => {
expect(parseExactPhrase('""')).toBe(null);
expect(parseExactPhrase('"')).toBe(null);
});
});

describe("SearchIndex exact-phrase search", () => {
// Mirrors the report in issue #137: the fuzzy encoder maps "aldi" and
// "aldea" to the same tokens, so a plain search for "aldi" surfaces
// "ALDEA HOMES" before the real "ALDI" rows.
const points = [
{ id: 1, text: "ALDEA HOMES" },
{ id: 2, text: "ALDEA HOMES TWO" },
{ id: 3, text: "ALDI Supermarket" },
{ id: 4, text: "Corner ALDI" },
{ id: 5, text: "Walmart" },
];

test("the fuzzy search reproduces the false matches from the issue", () => {
let index = new SearchIndex();
index.addPoints(points);
let result = index.query("aldi", 100);
// The fuzzy encoder returns the "ALDEA" rows alongside the real matches,
// which is the behavior the quoted search is meant to avoid.
expect(result).toContain(1);
expect(result).toContain(2);
});

test("a quoted query matches only the exact substring, case-insensitively", () => {
let index = new SearchIndex();
index.addPoints(points);
let result = index.query('"aldi"', 100);
expect(new Set(result)).toEqual(new Set([3, 4]));
expect(result).not.toContain(1);
expect(result).not.toContain(2);
expect(result).not.toContain(5);
});

test("a quoted query respects the limit", () => {
let index = new SearchIndex();
index.addPoints(points);
let result = index.query('"aldi"', 1);
expect(result.length).toBe(1);
});

test("clear resets both the fuzzy index and the exact-match texts", () => {
let index = new SearchIndex();
index.addPoints(points);
index.clear();
expect(index.query('"aldi"', 100)).toEqual([]);
});
});
Loading