keymanapp · jahorton · May 6, 2026 · May 8, 2026 · May 21, 2026
diff --git a/web/src/engine/predictive-text/templates/src/tokenization.ts b/web/src/engine/predictive-text/templates/src/tokenization.ts
@@ -95,6 +95,10 @@ export function tokenize(
     currentIndex = nextIndex;
   }
 
+  if(tokenization.left.length == 0) {
+    tokenization.left.push({text: '', isWhitespace: false});
+  }
+
   // New step 2: handle any rejoins needed.
 
   // Handle any desired special handling for directly-pre-caret scenarios - where for this

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts
@@ -129,6 +129,10 @@ export class ContextToken {
     return this.exampleInput == '';
   }
 
+  get codepointLength(): number { // Exposes the codepoint length reported by this token's searchModule.
+    return this.searchModule.codepointLength;
+  }
+
   /**
    * Denotes the original keystroke Transforms comprising the range corresponding
    * to this token.

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
@@ -614,6 +614,7 @@ interface RetokenizedEdgeWindow extends EdgeWindow {
 
 export interface ContextTokenLike {
   exampleInput: string;
+  codepointLength: number; // token length in codepoints (presumably); callers sum it to size a Transform's deleteLeft
   isPartial?: boolean;
   sourceRangeKey?: string;
 }

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts
@@ -71,7 +71,8 @@ export function determineModelTokenizer(model: LexicalModel) {
     if(model.wordbreaker) {
       return models.tokenize(model.wordbreaker, context);
     } else {
-      return null;
+      // Not ideal for pre-14.0 models, but it'll do for now.
+      return models.tokenize(wordBreakers.default, context);
     }
   }
 }

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
@@ -5,13 +5,14 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre
 
 import TransformUtils from './transformUtils.js';
 import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js';
-import { ContextTokenization } from './correction/context-tokenization.js';
+import { ContextTokenization, ContextTokenLike, mapWhitespacedTokenization } from './correction/context-tokenization.js';
 import { ContextTracker } from './correction/context-tracker.js';
 import { ContextState, determineContextSlideTransform } from './correction/context-state.js';
 import { ContextTransition } from './correction/context-transition.js';
 import { ExecutionTimer } from './correction/execution-timer.js';
 import ModelCompositor from './model-compositor.js';
 import { EDIT_DISTANCE_COST_SCALE, getBestTokenMatches } from './correction/distance-modeler.js';
+import { TokenResult } from './correction/tokenization-corrector.js';
 
 const searchForProperty = defaultWordbreaker.searchForProperty;
 
@@ -26,7 +27,6 @@ import Reversion = LexicalModelTypes.Reversion;
 import Suggestion = LexicalModelTypes.Suggestion;
 import SuggestionTag = LexicalModelTypes.SuggestionTag;
 import Transform = LexicalModelTypes.Transform;
-import { TokenResult } from './correction/tokenization-corrector.js';
 
 /*
  * The functions in this file exist to provide unit-testable stateless components for the
@@ -106,12 +106,6 @@ export type CorrectionPredictionTuple = {
   preservationTransform?: Transform;
 };
 
-export interface ContextTokenLike {
-  exampleInput: string;
-  isPartial?: boolean;
-  sourceRangeKey?: string;
-}
-
 /**
  * An enum to be used when categorizing the level of similarity between
  * generated Suggestions and the actual text upon which a Suggestion is
@@ -159,88 +153,69 @@ export function tupleDisplayOrderSort(a: CorrectionPredictionTuple, b: Correctio
   return b.totalProb - a.totalProb;
 }
 
-export async function correctAndEnumerateWithoutTraversals(
+export function determineTraversallessCorrectionSequences(
   lexicalModel: LexicalModel,
-  transformDistribution: Distribution<Transform>,
+  corrections: Distribution<Transform>,
   context: Context
-): Promise<{
-  /**
-   * For models that support correction-search caching, this provides the
-   * cached object corresponding to this method's operation.
-   *
-   * Otherwise, is `null`.
-   */
-  postContextState?: ContextState;
-
-  /**
-   * The suggestions generated based on the user's input state.
-   */
-  rawPredictions: CorrectionPredictionTuple[];
+): PredictionParameters[] {
+  let returnedPredictionData: PredictionParameters[] = [];
 
-  /**
-   * The id of a prior ContextTransition event that triggered a Suggestion found
-   * at the end of the Context.  Will be undefined if no edits have occurred
-   * since the Suggestion was applied.
-   */
-  revertableTransitionId?: number
-}> {
-  const inputTransform = transformDistribution[0].sample;
-  let rawPredictions: CorrectionPredictionTuple[] = [];
+  const tokenizer = determineModelTokenizer(lexicalModel);
+  const wordbreak = determineModelWordbreaker(lexicalModel);
 
-  let predictionRoots: ProbabilityMass<Transform>[];
+  const tokenization = tokenizer(context); // issue at present if no tokens exist!
+  // Adapts wordbreaker tokens to ContextTokenLike.  codepointLength must be set - deleteLeft sums it over removed tokens (NaN if missing).
+  const tokenMapper = (t: models.Token): ContextTokenLike => ({
+    exampleInput: t.text,
+    codepointLength: KMWString.length(t.text)
+  });
 
-  // Only allow new-word suggestions if space was the most likely keypress.
-  const allowSpace = TransformUtils.isWhitespace(inputTransform);
-  const allowBksp = TransformUtils.isBackspace(inputTransform);
+  for(let correction of corrections) {
+    // Step 1:  determine tokenization effects.  We can't use the
+    // ContextTokenization pattern due to the model's lack of LexiconTraversal
+    // support, though.
+    const transformId = correction.sample.id;
+    const postContext = models.applyTransform(correction.sample, context);
+    const postTokenization = tokenizer(postContext);
+
+    const transitionEffects = determineSuggestionRange(tokenization.left.map(tokenMapper), postTokenization.left.map(tokenMapper), (a, b) => a.exampleInput == b.exampleInput);
+    const match: TokenResult = {
+      matchString: wordbreak(postContext),
+      inputSamplingCost: -Math.log(correction.p),
+      knownCost: 0,
+      totalCost: -Math.log(correction.p)
+    };
 
-  // Generates raw prediction distributions for each valid input.  Can only 'correct'
-  // against the final input.
-  //
-  // This is the old, 12.0-13.0 'correction' style.
-  if(allowSpace) {
-    // Detect start of new word; prevent whitespace loss here.
-    predictionRoots = [{sample: inputTransform, p: 1.0}];
-  } else {
-    predictionRoots = transformDistribution.map((alt) => {
-      let transform = alt.sample;
-
-      // Filter out special keys unless they're expected.
-      if(TransformUtils.isWhitespace(transform) && !allowSpace) {
-        return null;
-      } else if(TransformUtils.isBackspace(transform) && !allowBksp) {
-        return null;
+    const suggestionParams = buildCorrectionSequence(transitionEffects, context, match, 1);
+
+    // // determineSuggestionRange?
+    // // - can we abstractify it to not need spaceID ordering?
+    // // - it should never be the case that the lead token for both is not found in the other (unless whole replacement or mismatch)
+    // // - then, iterate the section that matches perfectly.
+    const tokenizationMapping = mapWhitespacedTokenization(tokenization.left.map((t) => { return {exampleInput: t.text, codepointLength: KMWString.length(t.text)} }), lexicalModel, correction.sample);
+    const tokenizedCorrection = tokenizationMapping.tokenizedTransform;
+    const tokenizedCorrectionEntries = [...tokenizedCorrection.values()];
+    const { tokensToRemove, tokensToPredict } = transitionEffects;
+    const deleteLeft = tokensToPredict.length > 1 ? 0 : tokensToRemove.reduce((prev, curr) => prev + curr.codepointLength, 0);
+
+    // IF:  array has multiple entries, then build the preservation-transform as below, including the deleteLeft.
+    // If not, don't make one!
+    const preservationTransform = tokenizedCorrectionEntries.slice(0, -1).reduce((accum, curr) => {
+      return models.buildMergedTransform(accum, {...curr, deleteLeft: 0});
+    }, { insert: '', deleteLeft, id: correction.sample.id});
+
+    returnedPredictionData.push({
+      ...suggestionParams,
+      applyInPost: (p) => {
+        p.preservationTransform = preservationTransform;
+        if(transformId) {
+          p.prediction.sample.transformId = transformId;
+        }
       }
-
-      return alt;
-    });
-  }
-
-  const wordbreak = determineModelWordbreaker(lexicalModel);
-  // Remove `null` entries, then determine suggestions.
-  predictionRoots.forEach((pr) => {
-    const postContext = models.applyTransform(pr.sample, context);
-    const tailTokenText = wordbreak(postContext);
-    const rootContext = models.applyTransform({insert: '', deleteLeft: KMWString.length(tailTokenText)}, postContext);
-
-    const results = predictFromCorrectionSequence(lexicalModel, [{
-      sample: {
-        insert: tailTokenText,
-        deleteLeft: 0,
-        id: pr.sample.id
-      },
-      p: pr.p
-    }], rootContext);
-    results.forEach((r) => rawPredictions.push(r));
-  })
-
-  if(allowSpace) {
-    rawPredictions.forEach((entry) => entry.preservationTransform = inputTransform);
+    })
   }
 
-  return {
-    postContextState: null,
-    rawPredictions: rawPredictions
-  };
+  return returnedPredictionData;
 }
 
 /**
@@ -433,6 +408,37 @@ export interface PredictionParameters {
   applyInPost: (entry: CorrectionPredictionTuple) => void
 }
 
+export function buildCorrectionSequence(
+  transitionEffects: ReturnType<typeof determineSuggestionRange>,
+  context: Context,
+  match: Readonly<TokenResult>,
+  costFactor: number
+) { // Builds the root context (removed tokens deleted) plus the single correction to insert onto it.
+  const { tokensToPredict, tokensToRemove, extendsRoot } = transitionEffects;
+  const deleteLeft = (tokensToPredict.length > 1 && !extendsRoot)
+    ? (tokensToRemove[tokensToRemove.length - 1]?.codepointLength ?? 0) // only the last removed token; 0 if there is none
+    : tokensToRemove.reduce((prev, curr) => prev + curr.codepointLength, 0); // otherwise, all removed tokens
+
+  const rootContext = models.applyTransform({insert: '', deleteLeft}, context);
+
+  // Insert the correction onto rootContext; the deletions were already applied there, so deleteLeft stays 0.
+  const correctionTransform: Transform = {
+    insert: match.matchString,  // insert correction string
+    deleteLeft: 0,
+  }
+
+  const rootCost = match.totalCost;
+  const predictionRoot = {
+    sample: correctionTransform,
+    p: Math.exp(-rootCost * costFactor) // scale the match's total cost by costFactor, then exponentiate its negation
+  };
+
+  return {
+    rootContext,
+    tokenizedCorrection: [predictionRoot]
+  };
+}
+
 /**
  * This function takes in metadata about generated corrections (for models that
  * implement Traversals) and uses that to produce the corresponding parameters
@@ -447,37 +453,28 @@ export interface PredictionParameters {
  * building prediction probabilities.
  * @returns
  */
-export function determineTokenizedCorrectionSequence(
+export function determineTokenizedCorrectionSequence( //
   transition: ContextTransition,
   tokenization: ContextTokenization,
   match: Readonly<TokenResult>,
   costFactor: number
 ): PredictionParameters {
   const applicationTarget = transition.base.displayTokenization;
-  const { tokensToRemove, tokensToPredict } = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId);
+  const transitionParams = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId);
 
-  const deleteLeft = tokensToPredict.length > 1 ? 0 : tokensToRemove.reduce((prev, curr) => prev + curr.searchModule.codepointLength, 0);
-  const rootContext = models.applyTransform({insert: '', deleteLeft}, transition.base.context);
-
-  // Replace the existing context with the correction.
-  const correctionTransform: Transform = {
-    insert: match.matchString,  // insert correction string
-    deleteLeft: 0,
-  }
+  const suggestionParams = buildCorrectionSequence(transitionParams, transition.base.context, match, costFactor);
 
   if(transition.transitionId) {
-    correctionTransform.id = transition.transitionId // The correction should always be based on the most recent external transform/transcription ID.
+    suggestionParams.tokenizedCorrection.map((t) => t.sample.id = transition.transitionId); // The correction should always be based on the most recent external transform/transcription ID.
   }
 
-  const rootCost = match.totalCost;
-  const predictionRoot = {
-    sample: correctionTransform,
-    p: Math.exp(-rootCost * costFactor)
-  };
+  const { tokensToPredict, tokensToRemove } = transitionParams;
+  const deleteLeft = tokensToPredict.length > 1
+    ? tokensToRemove[tokensToRemove.length - 1]?.codepointLength ?? 0
+    : tokensToRemove.reduce((prev, curr) => prev + curr.codepointLength, 0);
 
   return {
-    rootContext,
-    tokenizedCorrection: [predictionRoot],
+    ...suggestionParams,
     applyInPost: (entry: CorrectionPredictionTuple) => {
       entry.preservationTransform = tokenization.taillessTrueKeystroke;
       // // Will need an extra lookup layer if the suggestion is generated from within a cluster.
@@ -530,7 +527,14 @@ export async function correctAndEnumerate(
   // It's mostly here to support models compiled before Keyman 14.0, which was
   // when the `LexiconTraversal` pattern was established.
   if(!contextTracker) {
-    return correctAndEnumerateWithoutTraversals(lexicalModel, transformDistribution, context);
+    const predictionData = determineTraversallessCorrectionSequences(lexicalModel, transformDistribution, context);
+    return {
+      rawPredictions: predictionData.flatMap((entry) => {
+        const predictions = predictFromCorrectionSequence(lexicalModel, entry.tokenizedCorrection, entry.rootContext);
+        predictions.forEach((p) => entry.applyInPost(p));
+        return predictions;
+      })
+    };
   }
 
   // 'else':  the current, 14.0+ pattern, which is able to leverage

diff --git a/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts
@@ -175,7 +175,7 @@ describe('Tokenization functions', function() {
     });
 
     it('properly handles empty-context cases', function() {
-      // Wordbreaking on a empty space => no word.
+      // Wordbreaking on an empty space => no word, but an empty initial token.
       let context = {
         left: '', startOfBuffer: true,
         right: '', endOfBuffer: true
@@ -184,7 +184,7 @@ describe('Tokenization functions', function() {
       let tokenization = models.tokenize(wordBreakers.default, context);
 
       let expectedResult: models.Tokenization = {
-        left: [],
+        left: [{text: '', isWhitespace: false}],
         right: [],
         caretSplitsToken: false
       };
@@ -193,11 +193,11 @@ describe('Tokenization functions', function() {
     });
 
     it('properly handles null context cases', function() {
-      // Wordbreaking on a empty space => no word.
+      // Wordbreaking on an empty space => no word, but an empty initial token.
       let tokenization = models.tokenize(wordBreakers.default, null);
 
       let expectedResult: models.Tokenization = {
-        left: [],
+        left: [{text: '', isWhitespace: false}],
         right: [],
         caretSplitsToken: false
       };