Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions web/src/engine/predictive-text/templates/src/tokenization.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ export function tokenize(
currentIndex = nextIndex;
}

if(tokenization.left.length == 0) {
tokenization.left.push({text: '', isWhitespace: false});
}

// New step 2: handle any rejoins needed.

// Handle any desired special handling for directly-pre-caret scenarios - where for this
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ export class ContextToken {
return this.exampleInput == '';
}

/**
 * The codepoint length of this token, as reported by its `searchModule`.
 */
get codepointLength(): number {
  const { codepointLength } = this.searchModule;
  return codepointLength;
}

/**
* Denotes the original keystroke Transforms comprising the range corresponding
* to this token.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,7 @@ interface RetokenizedEdgeWindow extends EdgeWindow {

/**
 * Minimal structural description of a context token: just the fields listed
 * below, so that callers can supply plain object literals where a full token
 * instance is not available.
 */
export interface ContextTokenLike {
/** Text associated with the token. */
exampleInput: string;
/**
 * Length of the token's text.
 * NOTE(review): presumably measured in Unicode codepoints rather than UTF-16
 * code units, going by the name - confirm against the producers of this value.
 */
codepointLength: number;
/** NOTE(review): optional flag; its exact semantics are not visible from this declaration - confirm. */
isPartial?: boolean;
/** NOTE(review): optional key; presumably identifies the token's source range - confirm. */
sourceRangeKey?: string;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ export function determineModelTokenizer(model: LexicalModel) {
if(model.wordbreaker) {
return models.tokenize(model.wordbreaker, context);
} else {
return null;
// Not ideal for pre-14.0 models, but it'll do for now.
return models.tokenize(wordBreakers.default, context);
}
}
}
Expand Down
204 changes: 104 additions & 100 deletions web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre

import TransformUtils from './transformUtils.js';
import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js';
import { ContextTokenization } from './correction/context-tokenization.js';
import { ContextTokenization, ContextTokenLike, mapWhitespacedTokenization } from './correction/context-tokenization.js';
import { ContextTracker } from './correction/context-tracker.js';
import { ContextState, determineContextSlideTransform } from './correction/context-state.js';
import { ContextTransition } from './correction/context-transition.js';
import { ExecutionTimer } from './correction/execution-timer.js';
import ModelCompositor from './model-compositor.js';
import { EDIT_DISTANCE_COST_SCALE, getBestTokenMatches } from './correction/distance-modeler.js';
import { TokenResult } from './correction/tokenization-corrector.js';

const searchForProperty = defaultWordbreaker.searchForProperty;

Expand All @@ -26,7 +27,6 @@ import Reversion = LexicalModelTypes.Reversion;
import Suggestion = LexicalModelTypes.Suggestion;
import SuggestionTag = LexicalModelTypes.SuggestionTag;
import Transform = LexicalModelTypes.Transform;
import { TokenResult } from './correction/tokenization-corrector.js';

/*
* The functions in this file exist to provide unit-testable stateless components for the
Expand Down Expand Up @@ -106,12 +106,6 @@ export type CorrectionPredictionTuple = {
preservationTransform?: Transform;
};

export interface ContextTokenLike {
exampleInput: string;
isPartial?: boolean;
sourceRangeKey?: string;
}

/**
* An enum to be used when categorizing the level of similarity between
* generated Suggestions and the actual text upon which a Suggestion is
Expand Down Expand Up @@ -159,88 +153,69 @@ export function tupleDisplayOrderSort(a: CorrectionPredictionTuple, b: Correctio
return b.totalProb - a.totalProb;
}

export async function correctAndEnumerateWithoutTraversals(
export function determineTraversallessCorrectionSequences(
lexicalModel: LexicalModel,
transformDistribution: Distribution<Transform>,
corrections: Distribution<Transform>,
context: Context
): Promise<{
/**
* For models that support correction-search caching, this provides the
* cached object corresponding to this method's operation.
*
* Otherwise, is `null`.
*/
postContextState?: ContextState;

/**
* The suggestions generated based on the user's input state.
*/
rawPredictions: CorrectionPredictionTuple[];
): PredictionParameters[] {
let returnedPredictionData: PredictionParameters[] = [];

/**
* The id of a prior ContextTransition event that triggered a Suggestion found
* at the end of the Context. Will be undefined if no edits have occurred
* since the Suggestion was applied.
*/
revertableTransitionId?: number
}> {
const inputTransform = transformDistribution[0].sample;
let rawPredictions: CorrectionPredictionTuple[] = [];
const tokenizer = determineModelTokenizer(lexicalModel);
const wordbreak = determineModelWordbreaker(lexicalModel);

let predictionRoots: ProbabilityMass<Transform>[];
const tokenization = tokenizer(context); // issue at present if no tokens exist!
const tokenMapper = (t: models.Token) => {
return {
exampleInput: t.text
} as ContextTokenLike;
}

// Only allow new-word suggestions if space was the most likely keypress.
const allowSpace = TransformUtils.isWhitespace(inputTransform);
const allowBksp = TransformUtils.isBackspace(inputTransform);
for(let correction of corrections) {
// Step 1: determine tokenization effects. We can't use the
// ContextTokenization pattern due to the model's lack of LexiconTraversal
// support, though.
const transformId = correction.sample.id;
const postContext = models.applyTransform(correction.sample, context);
const postTokenization = tokenizer(postContext);

const transitionEffects = determineSuggestionRange(tokenization.left.map(tokenMapper), postTokenization.left.map(tokenMapper), (a, b) => a.exampleInput == b.exampleInput);
const match: TokenResult = {
matchString: wordbreak(postContext),
inputSamplingCost: -Math.log(correction.p),
knownCost: 0,
totalCost: -Math.log(correction.p)
};

// Generates raw prediction distributions for each valid input. Can only 'correct'
// against the final input.
//
// This is the old, 12.0-13.0 'correction' style.
if(allowSpace) {
// Detect start of new word; prevent whitespace loss here.
predictionRoots = [{sample: inputTransform, p: 1.0}];
} else {
predictionRoots = transformDistribution.map((alt) => {
let transform = alt.sample;

// Filter out special keys unless they're expected.
if(TransformUtils.isWhitespace(transform) && !allowSpace) {
return null;
} else if(TransformUtils.isBackspace(transform) && !allowBksp) {
return null;
const suggestionParams = buildCorrectionSequence(transitionEffects, context, match, 1);

// // determineSuggestionRange?
// // - can we abstractify it to not need spaceID ordering?
// // - it should never be the case that the lead token for both is not found in the other (unless whole replacement or mismatch)
// // - then, iterate the section that matches perfectly.
const tokenizationMapping = mapWhitespacedTokenization(tokenization.left.map((t) => { return {exampleInput: t.text, codepointLength: KMWString.length(t.text)} }), lexicalModel, correction.sample);
const tokenizedCorrection = tokenizationMapping.tokenizedTransform;
const tokenizedCorrectionEntries = [...tokenizedCorrection.values()];
const { tokensToRemove, tokensToPredict } = transitionEffects;
const deleteLeft = tokensToPredict.length > 1 ? 0 : tokensToRemove.reduce((prev, curr) => prev + curr.codepointLength, 0);

// IF: array has multiple entries, then build the preservation-transform as below, including the deleteLeft.
// If not, don't make one!
const preservationTransform = tokenizedCorrectionEntries.slice(0, -1).reduce((accum, curr) => {
return models.buildMergedTransform(accum, {...curr, deleteLeft: 0});
}, { insert: '', deleteLeft, id: correction.sample.id});

returnedPredictionData.push({
...suggestionParams,
applyInPost: (p) => {
p.preservationTransform = preservationTransform;
if(transformId) {
p.prediction.sample.transformId = transformId;
}
}

return alt;
});
}

const wordbreak = determineModelWordbreaker(lexicalModel);
// Remove `null` entries, then determine suggestions.
predictionRoots.forEach((pr) => {
const postContext = models.applyTransform(pr.sample, context);
const tailTokenText = wordbreak(postContext);
const rootContext = models.applyTransform({insert: '', deleteLeft: KMWString.length(tailTokenText)}, postContext);

const results = predictFromCorrectionSequence(lexicalModel, [{
sample: {
insert: tailTokenText,
deleteLeft: 0,
id: pr.sample.id
},
p: pr.p
}], rootContext);
results.forEach((r) => rawPredictions.push(r));
})

if(allowSpace) {
rawPredictions.forEach((entry) => entry.preservationTransform = inputTransform);
})
}

return {
postContextState: null,
rawPredictions: rawPredictions
};
return returnedPredictionData;
}

/**
Expand Down Expand Up @@ -433,6 +408,37 @@ export interface PredictionParameters {
applyInPost: (entry: CorrectionPredictionTuple) => void
}

/**
 * Produces the root context and the single-entry correction sequence used to
 * generate predictions for one correction match.
 *
 * The number of codepoints deleted from `context` to form the root context is:
 * - the `codepointLength` of the last entry in `tokensToRemove` (or 0 if there
 *   is none) when more than one token is to be predicted and `extendsRoot` is
 *   falsy;
 * - otherwise, the sum of `codepointLength` over all entries in
 *   `tokensToRemove`.
 *
 * @param transitionEffects  The result of `determineSuggestionRange` for the
 *                           transition being corrected.
 * @param context            The context the deletion is applied to.
 * @param match              The correction match; its `matchString` becomes
 *                           the inserted text and its `totalCost` determines
 *                           the probability.
 * @param costFactor         Multiplier applied to `match.totalCost` before it
 *                           is converted to a probability via `exp(-cost)`.
 * @returns The root context and a one-element `tokenizedCorrection` array.
 */
export function buildCorrectionSequence(
  transitionEffects: ReturnType<typeof determineSuggestionRange>,
  context: Context,
  match: Readonly<TokenResult>,
  costFactor: number
) {
  const {
    tokensToPredict: pending,
    tokensToRemove: removed,
    extendsRoot
  } = transitionEffects;

  let deleteLeft: number;
  if(pending.length > 1 && !extendsRoot) {
    // Only the final removed token contributes; no removed tokens => nothing to delete.
    const lastRemoved = removed[removed.length - 1];
    deleteLeft = lastRemoved?.codepointLength ?? 0;
  } else {
    // Every removed token contributes its full length.
    deleteLeft = removed.reduce((total, token) => total + token.codepointLength, 0);
  }

  // Strip the computed amount of text from the context; predictions are rooted here.
  const rootContext = models.applyTransform({insert: '', deleteLeft}, context);

  // The correction itself is a pure insertion on top of the root context.
  const correctionTransform: Transform = {
    insert: match.matchString,
    deleteLeft: 0,
  };

  return {
    rootContext,
    tokenizedCorrection: [{
      sample: correctionTransform,
      p: Math.exp(-match.totalCost * costFactor)
    }]
  };
}

/**
* This function takes in metadata about generated corrections (for models that
* implement Traversals) and uses that to produce the corresponding parameters
Expand All @@ -447,37 +453,28 @@ export interface PredictionParameters {
* building prediction probabilities.
* @returns
*/
export function determineTokenizedCorrectionSequence(
export function determineTokenizedCorrectionSequence( //
transition: ContextTransition,
tokenization: ContextTokenization,
match: Readonly<TokenResult>,
costFactor: number
): PredictionParameters {
const applicationTarget = transition.base.displayTokenization;
const { tokensToRemove, tokensToPredict } = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId);
const transitionParams = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId);

const deleteLeft = tokensToPredict.length > 1 ? 0 : tokensToRemove.reduce((prev, curr) => prev + curr.searchModule.codepointLength, 0);
const rootContext = models.applyTransform({insert: '', deleteLeft}, transition.base.context);

// Replace the existing context with the correction.
const correctionTransform: Transform = {
insert: match.matchString, // insert correction string
deleteLeft: 0,
}
const suggestionParams = buildCorrectionSequence(transitionParams, transition.base.context, match, costFactor);

if(transition.transitionId) {
correctionTransform.id = transition.transitionId // The correction should always be based on the most recent external transform/transcription ID.
suggestionParams.tokenizedCorrection.map((t) => t.sample.id = transition.transitionId); // The correction should always be based on the most recent external transform/transcription ID.
}

const rootCost = match.totalCost;
const predictionRoot = {
sample: correctionTransform,
p: Math.exp(-rootCost * costFactor)
};
const { tokensToPredict, tokensToRemove } = transitionParams;
const deleteLeft = tokensToPredict.length > 1
? tokensToRemove[tokensToRemove.length - 1]?.codepointLength ?? 0
: tokensToRemove.reduce((prev, curr) => prev + curr.codepointLength, 0);

return {
rootContext,
tokenizedCorrection: [predictionRoot],
...suggestionParams,
applyInPost: (entry: CorrectionPredictionTuple) => {
entry.preservationTransform = tokenization.taillessTrueKeystroke;
// // Will need an extra lookup layer if the suggestion is generated from within a cluster.
Expand Down Expand Up @@ -530,7 +527,14 @@ export async function correctAndEnumerate(
// It's mostly here to support models compiled before Keyman 14.0, which was
// when the `LexiconTraversal` pattern was established.
if(!contextTracker) {
return correctAndEnumerateWithoutTraversals(lexicalModel, transformDistribution, context);
const predictionData = determineTraversallessCorrectionSequences(lexicalModel, transformDistribution, context);
return {
rawPredictions: predictionData.flatMap((entry) => {
const predictions = predictFromCorrectionSequence(lexicalModel, entry.tokenizedCorrection, entry.rootContext);
predictions.forEach((p) => entry.applyInPost(p));
return predictions;
})
};
}

// 'else': the current, 14.0+ pattern, which is able to leverage
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ describe('Tokenization functions', function() {
});

it('properly handles empty-context cases', function() {
// Wordbreaking on a empty space => no word.
// Wordbreaking on an empty space => no word, but empty initial token.
let context = {
left: '', startOfBuffer: true,
right: '', endOfBuffer: true
Expand All @@ -184,7 +184,7 @@ describe('Tokenization functions', function() {
let tokenization = models.tokenize(wordBreakers.default, context);

let expectedResult: models.Tokenization = {
left: [],
left: [{text: '', isWhitespace: false}],
right: [],
caretSplitsToken: false
};
Expand All @@ -193,11 +193,11 @@ describe('Tokenization functions', function() {
});

it('properly handles null context cases', function() {
// Wordbreaking on a empty space => no word.
// Wordbreaking on an empty space => no word, but empty initial token.
let tokenization = models.tokenize(wordBreakers.default, null);

let expectedResult: models.Tokenization = {
left: [],
left: [{text: '', isWhitespace: false}],
right: [],
caretSplitsToken: false
};
Expand Down
Loading
Loading