-
Notifications
You must be signed in to change notification settings - Fork 302
fix: support quoted exact-phrase full text search #204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
adityasingh2400
wants to merge
2
commits into
apple:main
Choose a base branch
from
adityasingh2400:fix-search-exact-phrase
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+232
−30
Open
Changes from 1 commit
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,41 +1,13 @@ | ||
| // Copyright (c) 2025 Apple Inc. Licensed under MIT License. | ||
|
|
||
| import { createWorkerRuntime } from "@embedding-atlas/utils"; | ||
| import { Charset, Index, type IndexOptions } from "flexsearch"; | ||
|
|
||
| import { SearchIndex } from "./search_index.js"; | ||
|
|
||
| let { handler, registerClass } = createWorkerRuntime(); | ||
|
|
||
| onmessage = handler; | ||
|
|
||
| const options: IndexOptions = { | ||
| tokenize: "forward", | ||
| encoder: Charset.LatinBalance, | ||
| }; | ||
|
|
||
| class SearchIndex { | ||
| private index: Index; | ||
|
|
||
| constructor() { | ||
| this.index = new Index(options); | ||
| } | ||
|
|
||
| clear() { | ||
| this.index.clear(); | ||
| this.index.cleanup(); | ||
| this.index = new Index(options); | ||
| } | ||
|
|
||
| addPoints(points: { id: string | number; text: string }[]) { | ||
| for (let p of points) { | ||
| this.index.add(p.id, p.text); | ||
| } | ||
| } | ||
|
|
||
| query(query: string, limit: number): (string | number)[] { | ||
| return this.index.search(query, { limit }); | ||
| } | ||
| } | ||
|
|
||
| export type { SearchIndex }; | ||
|
|
||
| registerClass("SearchIndex", () => new SearchIndex()); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| // Copyright (c) 2025 Apple Inc. Licensed under MIT License. | ||
|
|
||
| import { Charset, Index, type IndexOptions } from "flexsearch"; | ||
|
|
||
| const options: IndexOptions = { | ||
| tokenize: "forward", | ||
| encoder: Charset.LatinBalance, | ||
| }; | ||
|
|
||
| /** | ||
| * Parse a query string for an exact-phrase request. | ||
| * | ||
| * A query wrapped in double quotes (e.g. `"aldi"`) means the user wants an | ||
| * exact, case-insensitive substring match instead of the default fuzzy token | ||
| * search. The default encoder maps similar-looking words to the same token | ||
| * (for example "aldi" and "aldea"), which is great for fuzzy recall but | ||
| * surfaces unwanted matches when the user knows exactly what they are looking | ||
| * for. Quoting opts out of that behavior. | ||
| * | ||
| * Returns the inner phrase when the query is a quoted exact phrase, otherwise | ||
| * null. An empty quoted string (`""`) is treated as not a phrase so the caller | ||
| * falls back to the regular path. | ||
| */ | ||
| export function parseExactPhrase(query: string): string | null { | ||
| if (query.length >= 2 && query.startsWith('"') && query.endsWith('"')) { | ||
| let inner = query.slice(1, -1); | ||
| return inner.length > 0 ? inner : null; | ||
| } | ||
| return null; | ||
| } | ||
|
|
||
| /** | ||
| * Full text search index backed by flexsearch. | ||
| * | ||
| * In addition to the default fuzzy token search, the index keeps the original | ||
| * text for each point so a quoted query can perform an exact, case-insensitive | ||
| * substring match. | ||
| */ | ||
| export class SearchIndex { | ||
| private index: Index; | ||
| private texts: Map<string | number, string>; | ||
|
|
||
| constructor() { | ||
| this.index = new Index(options); | ||
| this.texts = new Map(); | ||
| } | ||
|
|
||
| clear() { | ||
| this.index.clear(); | ||
| this.index.cleanup(); | ||
| this.index = new Index(options); | ||
| this.texts = new Map(); | ||
| } | ||
|
|
||
| addPoints(points: { id: string | number; text: string }[]) { | ||
| for (let p of points) { | ||
| this.index.add(p.id, p.text); | ||
| this.texts.set(p.id, p.text); | ||
| } | ||
| } | ||
|
|
||
| query(query: string, limit: number): (string | number)[] { | ||
| let phrase = parseExactPhrase(query); | ||
| if (phrase != null) { | ||
| return this.exactSearch(phrase, limit); | ||
| } | ||
| return this.index.search(query, { limit }); | ||
| } | ||
|
|
||
| private exactSearch(phrase: string, limit: number): (string | number)[] { | ||
| let needle = phrase.toLowerCase(); | ||
| let result: (string | number)[] = []; | ||
| for (let [id, text] of this.texts) { | ||
| if (text.toLowerCase().includes(needle)) { | ||
| result.push(id); | ||
| if (result.length >= limit) { | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| return result; | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| // Copyright (c) 2025 Apple Inc. Licensed under MIT License. | ||
|
|
||
| import { describe, expect, test } from "vitest"; | ||
|
|
||
| import { parseExactPhrase, SearchIndex } from "../src/search/search_index.js"; | ||
|
|
||
| describe("parseExactPhrase", () => { | ||
| test("returns the inner phrase for a quoted query", () => { | ||
| expect(parseExactPhrase('"aldi"')).toBe("aldi"); | ||
| expect(parseExactPhrase('"new york"')).toBe("new york"); | ||
| }); | ||
|
|
||
| test("returns null for an unquoted query", () => { | ||
| expect(parseExactPhrase("aldi")).toBe(null); | ||
| expect(parseExactPhrase('aldi"')).toBe(null); | ||
| expect(parseExactPhrase('"aldi')).toBe(null); | ||
| }); | ||
|
|
||
| test("returns null for an empty quoted query", () => { | ||
| expect(parseExactPhrase('""')).toBe(null); | ||
| expect(parseExactPhrase('"')).toBe(null); | ||
| }); | ||
| }); | ||
|
|
||
| describe("SearchIndex exact-phrase search", () => { | ||
| // Mirrors the report in issue #137: the fuzzy encoder maps "aldi" and | ||
| // "aldea" to the same tokens, so a plain search for "aldi" surfaces | ||
| // "ALDEA HOMES" before the real "ALDI" rows. | ||
| const points = [ | ||
| { id: 1, text: "ALDEA HOMES" }, | ||
| { id: 2, text: "ALDEA HOMES TWO" }, | ||
| { id: 3, text: "ALDI Supermarket" }, | ||
| { id: 4, text: "Corner ALDI" }, | ||
| { id: 5, text: "Walmart" }, | ||
| ]; | ||
|
|
||
| test("the fuzzy search reproduces the false matches from the issue", () => { | ||
| let index = new SearchIndex(); | ||
| index.addPoints(points); | ||
| let result = index.query("aldi", 100); | ||
| // The fuzzy encoder returns the "ALDEA" rows alongside the real matches, | ||
| // which is the behavior the quoted search is meant to avoid. | ||
| expect(result).toContain(1); | ||
| expect(result).toContain(2); | ||
| }); | ||
|
|
||
| test("a quoted query matches only the exact substring, case-insensitively", () => { | ||
| let index = new SearchIndex(); | ||
| index.addPoints(points); | ||
| let result = index.query('"aldi"', 100); | ||
| expect(new Set(result)).toEqual(new Set([3, 4])); | ||
| expect(result).not.toContain(1); | ||
| expect(result).not.toContain(2); | ||
| expect(result).not.toContain(5); | ||
| }); | ||
|
|
||
| test("a quoted query respects the limit", () => { | ||
| let index = new SearchIndex(); | ||
| index.addPoints(points); | ||
| let result = index.query('"aldi"', 1); | ||
| expect(result.length).toBe(1); | ||
| }); | ||
|
|
||
| test("clear resets both the fuzzy index and the exact-match texts", () => { | ||
| let index = new SearchIndex(); | ||
| index.addPoints(points); | ||
| index.clear(); | ||
| expect(index.query('"aldi"', 100)).toEqual([]); | ||
| }); | ||
| }); |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it be simpler to check for longer than 2 above instead of longer or equal?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch, and your other comment about
"aldi" storepushed me to rethink this whole parse step. I replacedparseExactPhrasewith aparseQuerythat walks the string quote by quote, so the special-case length guard is gone entirely. A quoted run becomes a phrase, anything outside the quotes stays free text, and empty quotes just contribute nothing. That removed the>= 2vs> 2ambiguity you flagged here.